
Example 1 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

From class AspectAnalyzer, method AnalyzeDocWithStnSplit.

@Override
protected boolean AnalyzeDocWithStnSplit(_Doc doc) {
    double sentiScore = 0;
    TokenizeResult result;
    String[] sentences = m_stnDetector.sentDetect(doc.getSource());
    // Collect the index and counts of features.
    HashMap<Integer, Double> spVct = new HashMap<Integer, Double>();
    // Added by Lin for constructing postagging vector.
    // Collect the index and counts of projected features.
    HashMap<Integer, Double> posTaggingVct = new HashMap<Integer, Double>();
    int y = doc.getYLabel();
    double stopwordCnt = 0, rawCnt = 0;
    for (String sentence : sentences) {
        // Three-step analysis: tokenize, normalize, stem.
        result = TokenizerNormalizeStemmer(sentence);
        String[] rawTokens = result.getRawTokens();
        // POS tagging runs on the raw tokens (tokenized only, not normalized or stemmed).
        String[] posTags = m_tagger.tag(rawTokens);
        HashMap<Integer, Double> sentence_vector = constructSpVct(result.getTokens(), y, spVct);
        // Added by Lin for constructing postagging vector.
        // Collect the index and counts of features.
        HashMap<Integer, Double> postaggingSentenceVct = constructPOSSpVct(rawTokens, posTags);
        if (sentence_vector.size() > 0) {
            // avoid empty sentence
            Utils.mergeVectors(sentence_vector, spVct);
            Utils.mergeVectors(postaggingSentenceVct, posTaggingVct);
            // since we already have the POS tags, we don't need to re-tag.
            sentiScore += sentiWordScore(rawTokens, posTags);
            stopwordCnt += result.getStopwordCnt();
            rawCnt += result.getRawCnt();
        }
    }
    // the document should be long enough
    if (spVct.size() >= m_lengthThreshold) {
        doc.createSpVct(spVct);
        doc.setStopwordProportion(stopwordCnt / rawCnt);
        // added by Lin.
        doc.createPOSVct(posTaggingVct);
        // Added by Lin for detecting aspects of a document.
        doc.setAspVct(detectAspects(spVct));
        // average sentence sentiWordNet score
        doc.setSentiScore(sentiScore / spVct.size());
        m_corpus.addDoc(doc);
        m_classMemberNo[y]++;
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        // Roll back here!
        rollBack(spVct, y);
        return false;
    }
}
Also used: structures.TokenizeResult, java.util.HashMap
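
Utils.mergeVectors does the folding of each sentence vector into the document-level vector above. Its implementation is not shown on this page; below is a minimal sketch of what such a helper could look like, assuming a sparse vector is a HashMap<Integer, Double> from feature index to weight. The actual Utils.mergeVectors in IR_Base may differ in signature and semantics.

import java.util.HashMap;
import java.util.Map;

public final class MergeVectorsSketch {
    // Hypothetical sketch: fold every (featureIndex, weight) pair of src into dst,
    // summing weights when a feature index is already present in dst.
    static void mergeVectors(HashMap<Integer, Double> src, HashMap<Integer, Double> dst) {
        for (Map.Entry<Integer, Double> e : src.entrySet())
            dst.merge(e.getKey(), e.getValue(), Double::sum);
    }
}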

Example 2 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

From class DocAnalyzer, method TokenizerNormalizeStemmer.

// Given a long string, tokenize it, normalize it, and stem it; return the resulting token array.
protected TokenizeResult TokenizerNormalizeStemmer(String source) {
    // Original tokens.
    String[] tokens = Tokenizer(source);
    TokenizeResult result = new TokenizeResult(tokens);
    // Normalize them and stem them.
    for (int i = 0; i < tokens.length; i++)
        tokens[i] = SnowballStemming(Normalize(tokens[i]));
    LinkedList<String> Ngrams = new LinkedList<String>();
    int tokenLength = tokens.length, N = m_Ngram;
    for (int i = 0; i < tokenLength; i++) {
        String token = tokens[i];
        boolean legit = isLegit(token);
        if (legit)
            Ngrams.add(token); // keep the unigram
        else
            result.incStopwords();
        // grow 2- to N-grams ending at the current token
        if (!isBoundary(token)) {
            for (int j = i - 1; j >= Math.max(0, i - N + 1); j--) {
                if (isBoundary(tokens[j]))
                    break; // stop at a sentence boundary
                token = tokens[j] + "-" + token;
                legit &= isLegit(tokens[j]);
                if (legit) // every token in the n-gram must be legitimate
                    Ngrams.add(token);
            }
        }
    }
    result.setTokens(Ngrams.toArray(new String[Ngrams.size()]));
    return result;
}
Also used: structures.TokenizeResult, java.util.LinkedList
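
The inner loop is the subtle part of TokenizerNormalizeStemmer: it walks backwards from each position, joining up to N tokens with "-" and stopping at sentence boundaries. The self-contained sketch below isolates that loop so its output is easy to inspect; the isLegit and isBoundary stand-ins are hypothetical (the real DocAnalyzer versions check stopword lists and punctuation rules).

import java.util.LinkedList;
import java.util.List;

public class NgramDemo {
    // Hypothetical stand-ins: a token is "legit" if it is purely alphabetic,
    // and only the sentinel "." marks a boundary.
    static boolean isLegit(String t) { return t.matches("[a-z]+"); }
    static boolean isBoundary(String t) { return t.equals("."); }

    // Mirrors the n-gram loop above: emit each legit unigram, then grow
    // 2- to N-grams backwards until a boundary or the window edge is hit.
    public static List<String> ngrams(String[] tokens, int N) {
        LinkedList<String> out = new LinkedList<>();
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            boolean legit = isLegit(token);
            if (legit)
                out.add(token); // unigram
            if (!isBoundary(token)) {
                for (int j = i - 1; j >= Math.max(0, i - N + 1); j--) {
                    if (isBoundary(tokens[j]))
                        break; // stop at a sentence boundary
                    token = tokens[j] + "-" + token;
                    legit &= isLegit(tokens[j]);
                    if (legit) // every token in the n-gram must be legitimate
                        out.add(token);
                }
            }
        }
        return out;
    }

    public static void main(String[] args) {
        // Prints [nice, hotel, nice-hotel, bad, food, bad-food]:
        // the boundary "." blocks the hotel-bad bigram.
        System.out.println(ngrams(new String[] { "nice", "hotel", ".", "bad", "food" }, 3));
    }
}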

Example 3 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

From class HTSMAnalyzer, method analyzeSection.

void analyzeSection(String content, int y, HashMap<Integer, Double> docVct, ArrayList<HashMap<Integer, Double>> spVcts) {
    TokenizeResult result = TokenizerNormalizeStemmer(content);
    String[] tokens = result.getTokens();
    HashMap<Integer, Double> vPtr = constructSpVct(tokens, y, docVct);
    spVcts.add(vPtr);
    Utils.mergeVectors(vPtr, docVct);
}
Also used: structures.TokenizeResult
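
constructSpVct is shared by all of these analyzers but is not shown on this page. Below is a plausible minimal sketch, assuming a fixed token-to-index vocabulary and plain term counting; the real method also receives the class label y and the document-level vector, both of which this sketch ignores.

import java.util.HashMap;

public class SpVctSketch {
    // Hypothetical vocabulary: token -> feature index. The real DocAnalyzer
    // maintains (and may grow) this mapping; here it is assumed fixed.
    private final HashMap<String, Integer> m_vocab = new HashMap<>();

    // Count occurrences of in-vocabulary tokens; out-of-vocabulary tokens
    // are silently skipped.
    HashMap<Integer, Double> constructSpVct(String[] tokens) {
        HashMap<Integer, Double> vct = new HashMap<>();
        for (String t : tokens) {
            Integer idx = m_vocab.get(t);
            if (idx != null)
                vct.merge(idx, 1.0, Double::sum);
        }
        return vct;
    }
}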

Example 4 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

From class HTSMAnalyzer, method analyzeSectionWithStnSplit.

int analyzeSectionWithStnSplit(String content, int y, int sLabel, HashMap<Integer, Double> docVct, ArrayList<HashMap<Integer, Double>> spVcts, ArrayList<_Stn> stnList) {
    TokenizeResult result;
    HashMap<Integer, Double> vPtr;
    int stnCount = 0;
    for (String sentence : m_stnDetector.sentDetect(content)) {
        result = TokenizerNormalizeStemmer(sentence);
        vPtr = constructSpVct(result.getTokens(), y, docVct);
        if (vPtr.size() > 0) {
            // avoid empty sentence
            // POS tagging has to be on the raw tokens
            String[] posTags = m_tagger.tag(result.getRawTokens());
            // 0 for pos
            stnList.add(new _Stn(Utils.createSpVct(vPtr), result.getRawTokens(), posTags, sentence, sLabel));
            stnCount++;
            Utils.mergeVectors(vPtr, docVct);
            spVcts.add(vPtr);
        }
    }
    return stnCount;
}
Also used: structures.TokenizeResult, structures._Stn
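
The sentDetect and tag calls match Apache OpenNLP's SentenceDetectorME and POSTaggerME APIs, which is presumably what m_stnDetector and m_tagger are. A sketch of that setup follows; the model file paths are placeholders, and whitespace splitting stands in for whatever tokenizer IR_Base actually uses.

import java.io.FileInputStream;
import java.util.Arrays;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;

public class NlpSetupSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder model paths; use whichever pre-trained models the project ships.
        SentenceDetectorME stnDetector = new SentenceDetectorME(
                new SentenceModel(new FileInputStream("models/en-sent.bin")));
        POSTaggerME tagger = new POSTaggerME(
                new POSModel(new FileInputStream("models/en-pos-maxent.bin")));

        // Split into sentences, then POS-tag the raw tokens of each sentence.
        for (String s : stnDetector.sentDetect("Great room. Terrible service."))
            System.out.println(Arrays.toString(tagger.tag(s.split("\\s+"))));
    }
}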

Example 5 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

From class MultiThreadedLMAnalyzer, method AnalyzeDoc.

// Analyze the sparse features and language model features at the same time.
@Override
protected boolean AnalyzeDoc(_Doc doc, int core) {
    // Three-step analysis: tokenize, normalize, stem.
    TokenizeResult result = TokenizerNormalizeStemmer(doc.getSource(), core);
    String[] tokens = result.getTokens();
    int y = doc.getYLabel();
    // Construct the sparse vector.
    HashMap<Integer, Double> spVct = constructSpVct(tokens, y, null);
    // Construct the sparse vector for the language models.
    HashMap<Integer, Double> lmSpVct = constructLMSpVct(tokens);
    if (spVct.size() > m_lengthThreshold) {
        // temporary code for debugging purposes
        doc.createSpVct(spVct);
        doc.createLMSpVct(lmSpVct);
        doc.setStopwordProportion(result.getStopwordProportion());
        synchronized (m_corpusLock) {
            m_corpus.addDoc(doc);
            m_classMemberNo[y]++;
        }
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        // Roll back here!
        synchronized (m_rollbackLock) {
            // no need to roll back lm features.
            rollBack(spVct, y);
        }
        return false;
    }
}
Also used: structures.TokenizeResult
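
The synchronized blocks guard the two shared structures with dedicated lock objects, so per-document tokenization runs lock-free on each worker thread and only the shared-state updates serialize. A minimal sketch of that pattern follows; the lock field name comes from the example above, everything else is hypothetical.

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelAnalyzeSketch {
    private final Object m_corpusLock = new Object();
    private int m_corpusSize = 0; // stand-in for m_corpus / m_classMemberNo

    // Stand-in for AnalyzeDoc(doc, core): the expensive per-document work is
    // unsynchronized; only the shared-state update takes the lock.
    void analyzeDoc(String doc) {
        int tokenCnt = doc.split("\\s+").length; // cheap stand-in for tokenize/stem
        if (tokenCnt > 0) {
            synchronized (m_corpusLock) {
                m_corpusSize++;
            }
        }
    }

    void analyzeAll(List<String> docs, int cores) throws InterruptedException {
        ExecutorService pool = Executors.newFixedThreadPool(cores);
        for (String d : docs)
            pool.execute(() -> analyzeDoc(d));
        pool.shutdown();
        pool.awaitTermination(1, TimeUnit.HOURS);
    }
}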

Aggregations

TokenizeResult (structures.TokenizeResult): 9
HashMap (java.util.HashMap): 2
LinkedList (java.util.LinkedList): 2
structures._Stn (structures._Stn): 2
ArrayList (java.util.ArrayList): 1