Example usage of structures._Stn in the project IR_Base by Linda-sunshine: class AspectAnalyzer, method Annotate.
/**
 * Assigns each sentence of the document the aspect whose keyword list
 * matches it best; a sentence with no match, or with a tie between
 * aspects, is left unlabeled (topic -1).
 */
void Annotate(_Doc d) {
    for (int i = 0; i < d.getSenetenceSize(); i++) {
        _Stn sentence = d.getSentence(i);
        int bestCount = 0;
        int bestAspect = -1;
        for (int a = 0; a < m_aspects.size(); a++) {
            int hits = sentence.AnnotateByKeyword(m_aspects.get(a).m_keywords);
            if (hits > bestCount) {
                bestCount = hits;
                bestAspect = a;
            } else if (hits == bestCount) {
                // how should we break the tie? For now a tie discards the label.
                bestAspect = -1;
            }
        }
        sentence.setTopic(bestAspect);
    }
}
Example usage of structures._Stn in the project IR_Base by Linda-sunshine: class HTSMAnalyzer, method analyzeSectionWithStnSplit.
/**
 * Splits {@code content} into sentences, converts each non-empty sentence
 * into a sparse feature vector, and appends the resulting {@code _Stn}
 * objects and vectors to the supplied output collections.
 *
 * @param content raw section text (e.g. the pros/cons/comments of a post)
 * @param y       class label passed to {@code constructSpVct}
 * @param sLabel  section label stored on each created sentence
 * @param docVct  document-level vector; every kept sentence vector is merged into it
 * @param spVcts  output list receiving one sparse vector per kept sentence
 * @param stnList output list receiving one {@code _Stn} per kept sentence
 * @return the number of sentences that produced a non-empty vector
 */
int analyzeSectionWithStnSplit(String content, int y, int sLabel, HashMap<Integer, Double> docVct, ArrayList<HashMap<Integer, Double>> spVcts, ArrayList<_Stn> stnList) {
    int stnCount = 0;
    for (String sentence : m_stnDetector.sentDetect(content)) {
        // Tokenize per sentence only. (The original also tokenized the whole
        // content up front, but that result was immediately overwritten here,
        // so the redundant call has been removed.)
        TokenizeResult result = TokenizerNormalizeStemmer(sentence);
        HashMap<Integer, Double> vPtr = constructSpVct(result.getTokens(), y, docVct);
        if (vPtr.size() > 0) {
            // avoid empty sentence
            // POS tagging has to be on the raw tokens
            String[] posTags = m_tagger.tag(result.getRawTokens());
            // 0 for pos
            stnList.add(new _Stn(Utils.createSpVct(vPtr), result.getRawTokens(), posTags, sentence, sLabel));
            stnCount++;
            Utils.mergeVectors(vPtr, docVct);
            spVcts.add(vPtr);
        }
    }
    return stnCount;
}
Example usage of structures._Stn in the project IR_Base by Linda-sunshine: class HTSMAnalyzer, method AnalyzeNewEggPostWithSentence.
// Converts a NewEgg post into a _Doc: splits the sections selected by
// m_prosConsLoad (pros / cons / comments) into sentences, builds sparse
// vectors per sentence, and adds the document to the corpus when it meets
// both the feature-length and sentence-count thresholds.
// Returns true iff the document was added to the corpus.
// Throws ParseException when the post's date string cannot be parsed.
protected boolean AnalyzeNewEggPostWithSentence(_NewEggPost post) throws ParseException {
String content;
// to avoid empty sentences
ArrayList<_Stn> stnList = new ArrayList<_Stn>();
// Collect the index and counts of features.
ArrayList<HashMap<Integer, Double>> spVcts = new ArrayList<HashMap<Integer, Double>>();
// Raw text is only kept when m_releaseContent is false.
StringBuffer buffer = m_releaseContent ? null : new StringBuffer(256);
// docVct is used to collect DF statistics
HashMap<Integer, Double> docVct = new HashMap<Integer, Double>();
// Labels are shifted to be zero-based.
int y = post.getLabel() - 1;
int prosSentenceCounter = 0, consSentenceCounter = 0;
// Pros section: analyzed when the load type includes pros; sLabel 0 marks pros sentences.
if ((m_prosConsLoad == LoadType.LT_pros || m_prosConsLoad == LoadType.LT_procon || m_prosConsLoad == LoadType.LT_all) && (content = post.getProContent()) != null) {
// sentences in pro section
prosSentenceCounter = analyzeSectionWithStnSplit(content, y, 0, docVct, spVcts, stnList);
if (!m_releaseContent)
buffer.append(String.format("Pros: %s\n", content));
}
// Cons section: sLabel 1 marks cons sentences.
if ((m_prosConsLoad == LoadType.LT_cons || m_prosConsLoad == LoadType.LT_procon || m_prosConsLoad == LoadType.LT_all) && (content = post.getConContent()) != null) {
// tokenize cons
consSentenceCounter = analyzeSectionWithStnSplit(content, y, 1, docVct, spVcts, stnList);
if (!m_releaseContent)
buffer.append(String.format("Cons: %s\n", content));
}
// Comments section: sLabel -1; its sentence count is not tracked separately.
if ((m_prosConsLoad == LoadType.LT_comments || m_prosConsLoad == LoadType.LT_all) && (content = post.getComments()) != null) {
// tokenize comments
analyzeSectionWithStnSplit(content, y, -1, docVct, spVcts, stnList);
if (!m_releaseContent)
buffer.append(String.format("Comments: %s\n", content));
}
// Only keep documents that are long enough and have enough sentences.
if (docVct.size() >= m_lengthThreshold && stnList.size() >= m_stnSizeThreshold) {
long timeStamp = m_dateFormatter.parse(post.getDate()).getTime();
// int ID, String name, String prodID, String title, String source, int ylabel, long timeStamp
_Doc doc = new _Doc(m_corpus.getSize(), post.getID(), post.getProdId(), post.getTitle(), (m_releaseContent ? null : buffer.toString()), y, timeStamp);
// source = 2 means the Document is from newEgg
doc.setSourceType(2);
doc.createSpVct(spVcts);
doc.setSentences(stnList);
setStnFvs(doc);
m_corpus.addDoc(doc);
// Update per-class and per-section bookkeeping only for accepted documents.
m_classMemberNo[y]++;
m_prosStnCount += prosSentenceCounter;
m_consStnCount += consSentenceCounter;
return true;
} else
return false;
}
Example usage of structures._Stn in the project IR_Base by Linda-sunshine: class LDAGibbs4AC, method initialize_probability.
/**
 * Initializes the Gibbs-sampling state: resets the topic statistics to
 * their Dirichlet priors, draws initial topic assignments for every
 * document (parent docs also get per-sentence topic vectors), and
 * accumulates the word-topic sufficient statistics before imposing priors.
 */
protected void initialize_probability(Collection<_Doc> collection) {
    createSpace();
    // Reset statistics: word-topic counts start at the beta prior,
    // per-topic totals at beta * V.
    for (int k = 0; k < number_of_topics; k++) {
        Arrays.fill(topic_term_probabilty[k], 0);
        Arrays.fill(word_topic_sstat[k], d_beta);
    }
    Arrays.fill(m_sstat, d_beta * vocabulary_size);
    for (_Doc doc : collection) {
        if (doc instanceof _ParentDoc) {
            for (_Stn stn : doc.getSentences())
                stn.setTopicsVct(number_of_topics);
            doc.setTopics4Gibbs(number_of_topics, d_alpha);
        } else if (doc instanceof _ChildDoc) {
            ((_ChildDoc) doc).setTopics4Gibbs_LDA(number_of_topics, d_alpha);
        }
        // Fold the freshly drawn assignments into the sufficient statistics.
        for (_Word word : doc.getWords()) {
            word_topic_sstat[word.getTopic()][word.getIndex()]++;
            m_sstat[word.getTopic()]++;
        }
    }
    imposePrior();
}
Example usage of structures._Stn in the project IR_Base by Linda-sunshine: class LDAGibbs4AC_test, method discoverSpecificComments.
/**
 * Writes, for every parent document in the training set, the topic-distribution
 * similarity between each of its child documents and (a) the parent document
 * itself and (b) each of the parent's sentences.
 * Line format: {@code parentName\tchildName:docSim:stnIdx:stnSim...\t...}
 *
 * @param similarityFile path of the output file
 */
protected void discoverSpecificComments(String similarityFile) {
    System.out.println("topic similarity");
    // try-with-resources guarantees the writer is flushed and closed even if
    // computeSimilarity throws; the original leaked the PrintWriter on error.
    try (PrintWriter pw = new PrintWriter(new File(similarityFile))) {
        for (_Doc doc : m_trainSet) {
            if (doc instanceof _ParentDoc) {
                _ParentDoc pDoc = (_ParentDoc) doc; // hoist the repeated cast
                pw.print(pDoc.getName() + "\t");
                for (_ChildDoc cDoc : pDoc.m_childDocs) {
                    pw.print(cDoc.getName() + ":");
                    double docTopicSimilarity = computeSimilarity(pDoc.m_topics, cDoc.m_topics);
                    pw.print(docTopicSimilarity);
                    for (_Stn stnObj : pDoc.getSentences()) {
                        double stnTopicSimilarity = computeSimilarity(stnObj.m_topics, cDoc.m_topics);
                        pw.print(":" + (stnObj.getIndex() + 1) + ":" + stnTopicSimilarity);
                    }
                    pw.print("\t");
                }
                pw.println();
            }
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
Aggregations