Use of structures._Stn in project IR_Base by Linda-sunshine.
Class corrLDA_Gibbs, method initialize_probability.
/**
 * Initializes the Gibbs-sampling state over the given document collection:
 * smooths the topic-word sufficient statistics with the Dirichlet prior,
 * assigns initial topics per document, and accumulates word counts.
 */
@Override
protected void initialize_probability(Collection<_Doc> collection) {
	createSpace();

	// Seed every topic-word count with the symmetric Dirichlet prior d_beta.
	for (int k = 0; k < number_of_topics; k++)
		Arrays.fill(word_topic_sstat[k], d_beta);
	Arrays.fill(m_sstat, d_beta * vocabulary_size);

	for (_Doc doc : collection) {
		// Parent docs get sentence-level topic vectors plus standard Gibbs init;
		// child docs use the LDA-style initialization.
		if (doc instanceof _ParentDoc) {
			for (_Stn stn : doc.getSentences())
				stn.setTopicsVct(number_of_topics);
			doc.setTopics4Gibbs(number_of_topics, 0);
		} else if (doc instanceof _ChildDoc) {
			((_ChildDoc) doc).setTopics4Gibbs_LDA(number_of_topics, 0);
		}

		// Accumulate the initial topic assignments into the sufficient statistics.
		for (_Word word : doc.getWords()) {
			int tid = word.getTopic();
			word_topic_sstat[tid][word.getIndex()]++;
			m_sstat[tid]++;
		}
	}

	imposePrior();
	m_statisticsNormalized = false;
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class corrLDA_Gibbs, method initTest.
/**
 * Prepares a parent document and all of its children for perplexity testing:
 * initializes sentence topic vectors, holds out a proportion of each
 * document's words, and adds every document to the test set.
 */
protected void initTest(ArrayList<_Doc> sampleTestSet, _Doc d) {
	_ParentDoc parent = (_ParentDoc) d;

	for (_Stn stn : parent.getSentences())
		stn.setTopicsVct(number_of_topics);

	// Hold out a fraction of the parent's words for perplexity evaluation.
	int heldOut = (int) (m_testWord4PerplexityProportion * parent.getTotalDocLength());
	parent.setTopics4GibbsTest(number_of_topics, 0, heldOut);
	sampleTestSet.add(parent);

	// Apply the same hold-out treatment to each child document.
	for (_ChildDoc child : parent.m_childDocs) {
		heldOut = (int) (m_testWord4PerplexityProportion * child.getTotalDocLength());
		child.setTopics4GibbsTest(number_of_topics, 0, heldOut);
		sampleTestSet.add(child);
	}
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class languageModelBaseLine, method rankChild4StnByLanguageModel.
/**
 * Scores every child document of {@code pDoc} against the sentence
 * {@code stnObj} using a Dirichlet-smoothed unigram language model.
 *
 * @param stnObj the sentence whose likelihood is evaluated
 * @param pDoc   parent document providing the candidate child documents
 * @return map from child-document name to sentence log-likelihood
 */
protected HashMap<String, Double> rankChild4StnByLanguageModel(_Stn stnObj, _ParentDoc pDoc) {
	HashMap<String, Double> childLikelihoodMap = new HashMap<>();

	final double smoothingMu = 1000; // Dirichlet smoothing mass
	_SparseFeature[] sv = stnObj.getFv(); // hoisted: invariant across children

	for (_ChildDoc cDoc : pDoc.m_childDocs) {
		int cDocLen = cDoc.getTotalDocLength();
		_SparseFeature[] fv = cDoc.getSparse();

		// Interpolation weight toward the reference (collection) model;
		// approaches 1 for short documents.
		double alphaDoc = smoothingMu / (smoothingMu + cDocLen);

		double stnLogLikelihood = 0;
		for (_SparseFeature svWord : sv) {
			int wid = svWord.getIndex();
			double stnVal = svWord.getValue();

			double docVal = 0;
			int featureIndex = Utils.indexOf(fv, wid);
			if (featureIndex != -1)
				docVal = fv[featureIndex].getValue();

			// BUGFIX: guard cDocLen == 0 — the original expression evaluated
			// 0.0/0 for empty child documents, yielding NaN for the whole score.
			double smoothingProb = cDocLen > 0 ? (1 - alphaDoc) * docVal / cDocLen : 0;
			smoothingProb += alphaDoc * getReferenceProb(wid);

			stnLogLikelihood += stnVal * Math.log(smoothingProb);
		}
		childLikelihoodMap.put(cDoc.getName(), stnLogLikelihood);
	}
	return childLikelihoodMap;
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class HTMM, method accPhiStat.
/**
 * Accumulates word-topic sufficient statistics (phi) for one document,
 * weighting each word's frequency by the posterior probability of its
 * sentence being assigned topic i, summed over all 'constant' state
 * copies laid out in strides of number_of_topics in p_dwzpsi.
 */
void accPhiStat(_Doc d) {
	int senSize = d.getSenetenceSize();
	for (int t = 0; t < senSize; t++) {
		_Stn s = d.getSentence(t);
		for (_SparseFeature f : s.getFv()) {
			int wid = f.getIndex();
			double freq = f.getValue(); // word frequency in this sentence
			for (int i = 0; i < this.number_of_topics; i++) {
				// Marginalize topic i's posterior mass over the state copies;
				// j = 0 reads p_dwzpsi[t][i] exactly as the original did.
				double prob = 0;
				for (int j = 0; j < this.constant; j++)
					prob += this.p_dwzpsi[t][i + j * this.number_of_topics];
				this.word_topic_sstat[i][wid] += freq * prob;
			}
		}
	}
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class HTMM, method docSummary.
/**
 * Prints and writes a per-product summary: for each product and each topic,
 * the three sentences with the highest length-normalized topic score drawn
 * from the training set. Flushes and closes the summary writer when done.
 */
public void docSummary(String[] productList) {
	for (String prodID : productList) {
		for (int topic = 0; topic < this.number_of_topics; topic++) {
			// Bounded queue keeps only the three best-scoring sentences.
			MyPriorityQueue<_RankItem> topSentences = new MyPriorityQueue<_RankItem>(3);

			for (_Doc doc : m_trainSet) {
				if (!doc.getItemID().equalsIgnoreCase(prodID))
					continue;
				int senSize = doc.getSenetenceSize();
				for (int s = 0; s < senSize; s++) {
					_Stn sentence = doc.getSentence(s);
					// Document-level topic weight plus the sentence's word mass
					// under this topic, normalized by sentence length.
					double score = doc.m_topics[topic];
					for (_SparseFeature f : sentence.getFv())
						score += f.getValue() * topic_term_probabilty[topic][f.getIndex()];
					score /= sentence.getLength();
					topSentences.add(new _RankItem(sentence.getRawSentence(), score));
				}
			}

			System.out.format("Product: %s, Topic: %d\n", prodID, topic);
			summaryWriter.format("Product: %s, Topic: %d\n", prodID, topic);
			for (_RankItem item : topSentences) {
				System.out.format("%s\t%.3f\n", item.m_name, item.m_value);
				summaryWriter.format("%s\t%.3f\n", item.m_name, item.m_value);
			}
		}
	}
	summaryWriter.flush();
	summaryWriter.close();
}
Aggregations