Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
Class MultiThreadedUserAnalyzer, method TokenizerNormalizeStemmer.
// Given a long string, tokenize it, normalize it and stem it; build unigrams plus
// 2..m_Ngram-grams over the processed tokens and return them in a TokenizeResult.
protected TokenizeResult TokenizerNormalizeStemmer(String source, int core) {
// Original tokens. NOTE(review): `core` is presumably a per-thread tokenizer index
// in MultiThreadedUserAnalyzer — confirm against Tokenizer/SnowballStemming.
String[] tokens = Tokenizer(source, core);
TokenizeResult result = new TokenizeResult(tokens);
// Normalize them and stem them (in place, so N-grams below use processed forms).
for (int i = 0; i < tokens.length; i++) tokens[i] = SnowballStemming(Normalize(tokens[i]), core);
LinkedList<String> Ngrams = new LinkedList<String>();
int tokenLength = tokens.length, N = m_Ngram;
for (int i = 0; i < tokenLength; i++) {
String token = tokens[i];
boolean legit = isLegit(token);
if (legit)
// unigram: keep only legitimate (non-stopword) tokens
Ngrams.add(token);
else
result.incStopwords();
// Extend backwards from position i to form N-grams (from bigram up to N-gram).
if (!isBoundary(token)) {
for (int j = i - 1; j >= Math.max(0, i - N + 1); j--) {
if (isBoundary(tokens[j]))
// stop extending once a boundary token is reached
break;
token = tokens[j] + "-" + token;
legit |= isLegit(tokens[j]);
if (// keep the gram if at least one component token is legitimate
legit)
Ngrams.add(token);
}
}
}
result.setTokens(Ngrams.toArray(new String[Ngrams.size()]));
return result;
}
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
Class DocAnalyzer, method AnalyzeDocByStn.
/**
 * Analyzes a document sentence by sentence: tokenize/normalize/stem each sentence,
 * build per-sentence sparse feature vectors, and accept the document into the corpus
 * only if it clears both the feature-count and sentence-count thresholds.
 *
 * @param doc       the document to analyze; mutated (sparse vector, sentences) on accept
 * @param sentences the pre-split sentences of the document
 * @return true if the document was accepted and added to the corpus, false if rolled back
 */
protected boolean AnalyzeDocByStn(_Doc doc, String[] sentences) {
    TokenizeResult result;
    int y = doc.getYLabel(), index = 0;
    // Aggregated document-level features: feature index -> count.
    HashMap<Integer, Double> spVct = new HashMap<Integer, Double>();
    // Accepted sentences with their sparse feature vectors.
    ArrayList<_Stn> stnList = new ArrayList<_Stn>();
    double stopwordCnt = 0, rawCnt = 0;
    for (String sentence : sentences) {
        // Three-step analysis: tokenize, normalize, stem.
        result = TokenizerNormalizeStemmer(sentence);
        // Bag-of-words vector based on the normalized tokens.
        HashMap<Integer, Double> sentence_vector = constructSpVct(result.getTokens(), y, spVct);
        if (sentence_vector.size() > 2) {
            // Skip (nearly) empty sentences; tag POS only when a tagger is configured.
            String[] posTags = (m_tagger == null) ? null : m_tagger.tag(result.getRawTokens());
            stnList.add(new _Stn(index, Utils.createSpVct(sentence_vector), result.getRawTokens(), posTags, sentence));
            Utils.mergeVectors(sentence_vector, spVct);
            stopwordCnt += result.getStopwordCnt();
            rawCnt += result.getRawCnt();
        }
        index++;
    }
    // The document must be long enough in both distinct features and sentences.
    if (spVct.size() >= m_lengthThreshold && stnList.size() >= m_stnSizeThreshold) {
        doc.createSpVct(spVct);
        // Guard against 0/0 -> NaN when no sentence contributed raw tokens
        // (possible when the thresholds are 0 and every sentence was skipped).
        doc.setStopwordProportion(rawCnt > 0 ? stopwordCnt / rawCnt : 0);
        doc.setSentences(stnList);
        m_corpus.addDoc(doc);
        m_classMemberNo[y]++;
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        // Document rejected: undo the feature statistics accumulated above.
        rollBack(spVct, y);
        return false;
    }
}
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
Class DocAnalyzer, method AnalyzeDoc.
/* Analyze a document and, if it is long enough, add it to the corpus. */
protected boolean AnalyzeDoc(_Doc doc) {
    // Three-step analysis: tokenize, normalize, stem.
    TokenizeResult analysis = TokenizerNormalizeStemmer(doc.getSource());
    int label = doc.getYLabel();
    // Build the sparse bag-of-words vector from the processed tokens.
    HashMap<Integer, Double> features = constructSpVct(analysis.getTokens(), label, null);
    if (features.size() <= m_lengthThreshold) {
        /**
         **Roll back here!!*****
         */
        rollBack(features, label);
        return false;
    }
    // Accepted: attach the vector and register the document with the corpus.
    doc.createSpVct(features);
    doc.setStopwordProportion(analysis.getStopwordProportion());
    m_corpus.addDoc(doc);
    m_classMemberNo[label]++;
    if (m_releaseContent)
        doc.clearSource();
    return true;
}
Use of structures.TokenizeResult in project IR_Base by Linda-sunshine.
Class MultiThreadedUserAnalyzer, method AnalyzeDoc.
/* Analyze a document on worker `core` and add it to the shared corpus if long enough. */
protected boolean AnalyzeDoc(_Doc doc, int core) {
    // Three-step analysis: tokenize, normalize, stem (core-specific resources).
    TokenizeResult analysis = TokenizerNormalizeStemmer(doc.getSource(), core);
    int label = doc.getYLabel();
    // Build the sparse bag-of-words vector from the processed tokens.
    HashMap<Integer, Double> features = constructSpVct(analysis.getTokens(), label, null);
    if (features.size() <= m_lengthThreshold) {
        /**
         **Roll back here!!*****
         */
        synchronized (m_rollbackLock) {
            rollBack(features, label);
        }
        return false;
    }
    // temporary code for debugging purpose
    doc.createSpVct(features);
    doc.setStopwordProportion(analysis.getStopwordProportion());
    // The corpus and per-class counters are shared across worker threads.
    synchronized (m_corpusLock) {
        m_corpus.addDoc(doc);
        m_classMemberNo[label]++;
    }
    if (m_releaseContent)
        doc.clearSource();
    return true;
}
Aggregations