Search in sources :

Example 6 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

the class MultiThreadedUserAnalyzer method TokenizerNormalizeStemmer.

// Given a long string, tokenize it, normalize it and stem it, and return the string array.
protected TokenizeResult TokenizerNormalizeStemmer(String source, int core) {
    // Tokenize the raw text; the result keeps a reference to these originals.
    String[] tokens = Tokenizer(source, core);
    TokenizeResult result = new TokenizeResult(tokens);

    // Normalize and stem every token in place.
    for (int pos = 0; pos < tokens.length; pos++)
        tokens[pos] = SnowballStemming(Normalize(tokens[pos]), core);

    // Collect unigrams through m_Ngram-grams built from legitimate tokens.
    LinkedList<String> grams = new LinkedList<String>();
    int N = m_Ngram;
    for (int end = 0; end < tokens.length; end++) {
        String gram = tokens[end];
        boolean keep = isLegit(gram);
        if (keep)
            grams.add(gram); // unigram
        else
            result.incStopwords();

        // Boundary tokens never anchor a longer gram.
        if (isBoundary(gram))
            continue;

        // Grow the gram backwards: 2-grams up to N-grams ending at `end`.
        int lowest = Math.max(0, end - N + 1);
        for (int start = end - 1; start >= lowest; start--) {
            if (isBoundary(tokens[start]))
                break; // the gram would cross a boundary — stop extending
            gram = tokens[start] + "-" + gram;
            keep |= isLegit(tokens[start]);
            if (keep) // at least one constituent token is legitimate
                grams.add(gram);
        }
    }
    result.setTokens(grams.toArray(new String[grams.size()]));
    return result;
}
Also used : TokenizeResult(structures.TokenizeResult) LinkedList(java.util.LinkedList)

Example 7 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

the class DocAnalyzer method AnalyzeDocByStn.

/**
 * Analyzes a pre-segmented document sentence by sentence, building per-sentence
 * sparse feature vectors (with optional POS tags) and a document-level vector;
 * the document is added to the corpus only if it meets both the feature-count
 * and sentence-count thresholds, otherwise the collected statistics are rolled
 * back.
 */
protected boolean AnalyzeDocByStn(_Doc doc, String[] sentences) {
    TokenizeResult result;
    int y = doc.getYLabel(), index = 0;
    // Document-level sparse vector; accumulates feature indexes and counts
    // across all accepted sentences via mergeVectors below.
    HashMap<Integer, Double> spVct = new HashMap<Integer, Double>();
    // sparse sentence feature vectors
    ArrayList<_Stn> stnList = new ArrayList<_Stn>();
    double stopwordCnt = 0, rawCnt = 0;
    for (String sentence : sentences) {
        // Three-step analysis: tokenize, normalize, stem.
        result = TokenizerNormalizeStemmer(sentence);
        // construct bag-of-word vector based on normalized tokens
        // (the accumulated spVct is passed in — presumably so feature
        // construction can see document-level context; verify against
        // constructSpVct's contract)
        HashMap<Integer, Double> sentence_vector = constructSpVct(result.getTokens(), y, spVct);
        if (sentence_vector.size() > 2) {
            // avoid empty sentence — NOTE(review): cutoff is actually "more
            // than 2 distinct features", stricter than merely non-empty;
            // confirm intended
            String[] posTags;
            if (m_tagger == null)
                posTags = null;
            else
                posTags = m_tagger.tag(result.getRawTokens());
            stnList.add(new _Stn(index, Utils.createSpVct(sentence_vector), result.getRawTokens(), posTags, sentence));
            // Fold this sentence's features into the document-level vector.
            Utils.mergeVectors(sentence_vector, spVct);
            stopwordCnt += result.getStopwordCnt();
            rawCnt += result.getRawCnt();
        }
        // Sentence index advances even for rejected sentences, preserving
        // the original position within the document.
        index++;
    }
    // the document should be long enough — NOTE(review): sibling AnalyzeDoc
    // uses a strict '>' against m_lengthThreshold; confirm the '>=' here is
    // deliberate. Also, if no sentence is accepted, rawCnt stays 0 and the
    // proportion below would be NaN — presumably the thresholds prevent that.
    if (spVct.size() >= m_lengthThreshold && stnList.size() >= m_stnSizeThreshold) {
        doc.createSpVct(spVct);
        doc.setStopwordProportion(stopwordCnt / rawCnt);
        doc.setSentences(stnList);
        m_corpus.addDoc(doc);
        m_classMemberNo[y]++;
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        /**
         * Rejected: undo the feature statistics collected above.
         */
        rollBack(spVct, y);
        return false;
    }
}
Also used : TokenizeResult(structures.TokenizeResult) structures._Stn(structures._Stn) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList)

Example 8 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

the class DocAnalyzer method AnalyzeDoc.

/*Analyze a document and add the analyzed document back to corpus.*/
protected boolean AnalyzeDoc(_Doc doc) {
    // Three-step analysis.
    TokenizeResult result = TokenizerNormalizeStemmer(doc.getSource());
    String[] tokens = result.getTokens();
    int y = doc.getYLabel();
    // Construct the sparse vector.
    HashMap<Integer, Double> spVct = constructSpVct(tokens, y, null);
    if (spVct.size() > m_lengthThreshold) {
        doc.createSpVct(spVct);
        doc.setStopwordProportion(result.getStopwordProportion());
        m_corpus.addDoc(doc);
        m_classMemberNo[y]++;
        if (m_releaseContent)
            doc.clearSource();
        return true;
    } else {
        /**
         **Roll back here!!*****
         */
        rollBack(spVct, y);
        return false;
    }
}
Also used : TokenizeResult(structures.TokenizeResult)

Example 9 with TokenizeResult

use of structures.TokenizeResult in project IR_Base by Linda-sunshine.

the class MultiThreadedUserAnalyzer method AnalyzeDoc.

/*Analyze a document and add the analyzed document back to corpus.*/
/* Thread-safe variant: analyze a document on the given core and, if long
 * enough, add it to the shared corpus; otherwise roll back the collected
 * feature statistics. Returns true when the document was accepted. */
protected boolean AnalyzeDoc(_Doc doc, int core) {
    // Three-step analysis using this core's per-thread resources.
    TokenizeResult result = TokenizerNormalizeStemmer(doc.getSource(), core);
    int label = doc.getYLabel();
    // Bag-of-words sparse vector over the processed tokens.
    HashMap<Integer, Double> spVct = constructSpVct(result.getTokens(), label, null);
    if (spVct.size() <= m_lengthThreshold) {
        // Too short — undo the feature statistics under the rollback lock.
        synchronized (m_rollbackLock) {
            rollBack(spVct, label);
        }
        return false;
    }
    doc.createSpVct(spVct);
    doc.setStopwordProportion(result.getStopwordProportion());
    // The corpus and the per-class counters are shared across worker threads.
    synchronized (m_corpusLock) {
        m_corpus.addDoc(doc);
        m_classMemberNo[label]++;
    }
    if (m_releaseContent)
        doc.clearSource();
    return true;
}
Also used : TokenizeResult(structures.TokenizeResult)

Aggregations

TokenizeResult (structures.TokenizeResult)9 HashMap (java.util.HashMap)2 LinkedList (java.util.LinkedList)2 structures._Stn (structures._Stn)2 ArrayList (java.util.ArrayList)1