Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
The class AspectAnalyzer, method AnalyzeDocWithStnSplit.
@Override
protected boolean AnalyzeDocWithStnSplit(_Doc doc) {
    double sentiScore = 0;
    TokenizeResult result;
    String[] sentences = m_stnDetector.sentDetect(doc.getSource());
    // Collect the index and counts of features.
    HashMap<Integer, Double> spVct = new HashMap<Integer, Double>();
    // Added by Lin: collect the index and counts of projected features
    // for constructing the POS-tagging vector.
    HashMap<Integer, Double> posTaggingVct = new HashMap<Integer, Double>();
    int y = doc.getYLabel();
    double stopwordCnt = 0, rawCnt = 0;
    for (String sentence : sentences) {
        // Three-step analysis: tokenize, normalize, stem.
        result = TokenizerNormalizeStemmer(sentence);
        String[] rawTokens = result.getRawTokens();
        // POS tagging runs on the raw tokens (tokenized only, not stemmed).
        String[] posTags = m_tagger.tag(rawTokens);
        HashMap<Integer, Double> sentence_vector = constructSpVct(result.getTokens(), y, spVct);
        // Added by Lin: the POS-tagging feature vector of this sentence.
        HashMap<Integer, Double> postaggingSentenceVct = constructPOSSpVct(rawTokens, posTags);
        if (sentence_vector.size() > 0) { // avoid empty sentences
            Utils.mergeVectors(sentence_vector, spVct);
            Utils.mergeVectors(postaggingSentenceVct, posTaggingVct);
            // Reuse the POS tags computed above instead of tagging again.
            sentiScore += sentiWordScore(rawTokens, posTags);
            stopwordCnt += result.getStopwordCnt();
            rawCnt += result.getRawCnt();
        }
    }
    // The document should be long enough to be kept.
    if (spVct.size() >= m_lengthThreshold) {
        doc.createSpVct(spVct);
        doc.setStopwordProportion(stopwordCnt / rawCnt);
        // Added by Lin.
        doc.createPOSVct(posTaggingVct);
        // Added by Lin: detect the aspects of the document.
        doc.setAspVct(detectAspects(spVct));
        // Normalize the accumulated sentiWordNet score by the feature-vector size.
        doc.setSentiScore(sentiScore / spVct.size());
        m_corpus.addDoc(doc);
        m_classMemberNo[y]++;
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        // Document too short: roll back the feature statistics.
        rollBack(spVct, y);
        return false;
    }
}
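Each sentence-level vector is folded into the document-level vector through Utils.mergeVectors. The following is a minimal sketch of what such a merge could look like; the actual Utils implementation in IR_Base may differ:

import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of the merge step, not the real Utils.mergeVectors:
// accumulate the counts of one sparse vector into another.
final class VectorMergeSketch {
    static void mergeVectors(HashMap<Integer, Double> src, HashMap<Integer, Double> dst) {
        for (Map.Entry<Integer, Double> e : src.entrySet())
            dst.merge(e.getKey(), e.getValue(), Double::sum); // add counts, inserting when absent
    }
}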
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
The class DocAnalyzer, method TokenizerNormalizeStemmer.
// Given a long string, tokenize, normalize and stem it; return the result as a TokenizeResult.
protected TokenizeResult TokenizerNormalizeStemmer(String source) {
    // Original tokens.
    String[] tokens = Tokenizer(source);
    TokenizeResult result = new TokenizeResult(tokens);
    // Normalize and stem each token.
    for (int i = 0; i < tokens.length; i++)
        tokens[i] = SnowballStemming(Normalize(tokens[i]));
    LinkedList<String> Ngrams = new LinkedList<String>();
    int tokenLength = tokens.length, N = m_Ngram;
    for (int i = 0; i < tokenLength; i++) {
        String token = tokens[i];
        boolean legit = isLegit(token);
        if (legit)
            Ngrams.add(token); // unigram
        else
            result.incStopwords();
        // Build 2- to N-grams ending at the current token.
        if (!isBoundary(token)) {
            for (int j = i - 1; j >= Math.max(0, i - N + 1); j--) {
                if (isBoundary(tokens[j]))
                    break; // touched a sentence boundary
                token = tokens[j] + "-" + token;
                legit &= isLegit(tokens[j]);
                if (legit) // add the N-gram only when every token in it is legitimate
                    Ngrams.add(token);
            }
        }
    }
    result.setTokens(Ngrams.toArray(new String[Ngrams.size()]));
    return result;
}
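The methods above exercise most of the TokenizeResult interface: the constructor keeps the raw tokens, setTokens stores the legitimate unigrams and N-grams, and incStopwords together with getRawCnt feeds the stop-word statistics. A hypothetical minimal version of the class, inferred only from those calls (the real structures.TokenizeResult may carry more state):

// Hypothetical minimal sketch of structures.TokenizeResult, inferred from its usage above.
public class TokenizeResult {
    private final String[] m_rawTokens;  // tokens before normalization/stemming
    private String[] m_tokens;           // legitimate unigrams and N-grams
    private double m_stopwordCnt = 0;

    public TokenizeResult(String[] rawTokens) { m_rawTokens = rawTokens; }

    public String[] getRawTokens() { return m_rawTokens; }
    public String[] getTokens() { return m_tokens; }
    public void setTokens(String[] tokens) { m_tokens = tokens; }

    public void incStopwords() { m_stopwordCnt++; }
    public double getStopwordCnt() { return m_stopwordCnt; }
    public double getRawCnt() { return m_rawTokens.length; }
    public double getStopwordProportion() {
        return getRawCnt() == 0 ? 0 : m_stopwordCnt / getRawCnt();
    }
}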
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
The class HTSMAnalyzer, method analyzeSection.
void analyzeSection(String content, int y, HashMap<Integer, Double> docVct, ArrayList<HashMap<Integer, Double>> spVcts) {
    // Three-step analysis: tokenize, normalize, stem.
    TokenizeResult result = TokenizerNormalizeStemmer(content);
    String[] tokens = result.getTokens();
    // Sparse vector of this section, then fold it into the document-level vector.
    HashMap<Integer, Double> vPtr = constructSpVct(tokens, y, docVct);
    spVcts.add(vPtr);
    Utils.mergeVectors(vPtr, docVct);
}
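constructSpVct turns the stemmed tokens into a sparse count vector keyed by feature index. A rough sketch under the assumption of a vocabulary map from feature string to index (the map name is hypothetical; the real method also uses the label y and the document vector, which this sketch ignores):

// Hypothetical sketch of sparse-vector construction; assumes a vocabulary map
// featureNameIndex (feature string -> index), which is not the real IR_Base field.
HashMap<Integer, Double> constructSpVctSketch(String[] tokens, HashMap<String, Integer> featureNameIndex) {
    HashMap<Integer, Double> spVct = new HashMap<Integer, Double>();
    for (String token : tokens) {
        Integer index = featureNameIndex.get(token);
        if (index != null) // count only in-vocabulary tokens
            spVct.merge(index, 1.0, Double::sum);
    }
    return spVct;
}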
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
The class HTSMAnalyzer, method analyzeSectionWithStnSplit.
int analyzeSectionWithStnSplit(String content, int y, int sLabel, HashMap<Integer, Double> docVct, ArrayList<HashMap<Integer, Double>> spVcts, ArrayList<_Stn> stnList) {
    // The result is computed per sentence inside the loop.
    TokenizeResult result;
    HashMap<Integer, Double> vPtr;
    int stnCount = 0;
    for (String sentence : m_stnDetector.sentDetect(content)) {
        result = TokenizerNormalizeStemmer(sentence);
        vPtr = constructSpVct(result.getTokens(), y, docVct);
        if (vPtr.size() > 0) { // avoid empty sentences
            // POS tagging has to be done on the raw tokens.
            String[] posTags = m_tagger.tag(result.getRawTokens());
            stnList.add(new _Stn(Utils.createSpVct(vPtr), result.getRawTokens(), posTags, sentence, sLabel)); // sLabel: 0 for pos
            stnCount++;
            Utils.mergeVectors(vPtr, docVct);
            spVcts.add(vPtr);
        }
    }
    return stnCount;
}
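Utils.createSpVct compacts the per-sentence HashMap into the fixed sparse representation stored in each _Stn. A plausible sketch of that conversion, using a plain index/value array as a stand-in for the project's _SparseFeature type:

import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

// Hypothetical sketch: convert a HashMap-based sparse vector into (index, value)
// pairs sorted by feature index. The real Utils.createSpVct in IR_Base returns
// the project's _SparseFeature type instead of this stand-in.
final class SparseVectorSketch {
    static double[][] createSpVct(HashMap<Integer, Double> vct) {
        TreeMap<Integer, Double> sorted = new TreeMap<Integer, Double>(vct);
        double[][] pairs = new double[sorted.size()][2];
        int i = 0;
        for (Map.Entry<Integer, Double> e : sorted.entrySet()) {
            pairs[i][0] = e.getKey();   // feature index
            pairs[i][1] = e.getValue(); // feature count
            i++;
        }
        return pairs;
    }
}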
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
The class MultiThreadedLMAnalyzer, method AnalyzeDoc.
// Analyze the sparse features and the language-model features at the same time.
@Override
protected boolean AnalyzeDoc(_Doc doc, int core) {
    // Three-step analysis: tokenize, normalize, stem (per-core tokenizer).
    TokenizeResult result = TokenizerNormalizeStemmer(doc.getSource(), core);
    String[] tokens = result.getTokens();
    int y = doc.getYLabel();
    // Construct the sparse vector.
    HashMap<Integer, Double> spVct = constructSpVct(tokens, y, null);
    // Construct the sparse vector for the language models.
    HashMap<Integer, Double> lmSpVct = constructLMSpVct(tokens);
    if (spVct.size() > m_lengthThreshold) {
        // temporary code for debugging purpose
        doc.createSpVct(spVct);
        doc.createLMSpVct(lmSpVct);
        doc.setStopwordProportion(result.getStopwordProportion());
        synchronized (m_corpusLock) {
            m_corpus.addDoc(doc);
            m_classMemberNo[y]++;
        }
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        // Document too short: roll back the feature statistics.
        synchronized (m_rollbackLock) {
            // No need to roll back the LM features.
            rollBack(spVct, y);
        }
        return false;
    }
}
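Because the corpus update and the roll-back are guarded by m_corpusLock and m_rollbackLock, AnalyzeDoc can safely run on several threads, each passing its own core index so per-core tokenizer state is not shared. A hypothetical driver sketch (the method name and the stride partitioning are illustrative, not from IR_Base):

// Hypothetical driver sketch: analyze a batch of documents on several threads.
// Assumes this method lives inside the analyzer class so AnalyzeDoc is in scope.
void analyzeInParallel(final java.util.List<_Doc> docs, int cores) throws InterruptedException {
    Thread[] workers = new Thread[cores];
    for (int c = 0; c < cores; c++) {
        final int core = c;
        workers[c] = new Thread(new Runnable() {
            public void run() {
                // Stride partitioning: worker `core` handles docs core, core+cores, ...
                for (int i = core; i < docs.size(); i += cores)
                    AnalyzeDoc(docs.get(i), core);
            }
        });
        workers[c].start();
    }
    for (Thread w : workers)
        w.join();
}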