Search in sources :

Example 1 with structures._Stn

use of structures._Stn in project IR_Base by Linda-sunshine.

the class AspectAnalyzer method Annotate.

/**
 * Assigns a topic to every sentence of the given document.
 *
 * For each sentence, every entry of {@code m_aspects} is scored with
 * {@code AnnotateByKeyword} on that aspect's keyword list. The sentence's
 * topic is set to the index of the aspect whose score is strictly greater
 * than all scores seen so far; whenever a later aspect merely equals the
 * current best score (including the initial best of 0), the selection is
 * reset to -1, i.e. ties are left unresolved.
 *
 * @param d document whose sentences are annotated in place
 */
void Annotate(_Doc d) {
    int stnIdx = 0;
    while (stnIdx < d.getSenetenceSize()) {
        _Stn sentence = d.getSentence(stnIdx);
        int bestHits = 0;
        // -1 means "no unique best aspect"
        int bestAspect = -1;
        for (int aspectIdx = 0; aspectIdx < m_aspects.size(); aspectIdx++) {
            int hits = sentence.AnnotateByKeyword(m_aspects.get(aspectIdx).m_keywords);
            if (hits > bestHits) {
                // new strict maximum: remember this aspect
                bestHits = hits;
                bestAspect = aspectIdx;
                continue;
            }
            if (hits == bestHits) {
                // how should we break the tie? for now, no aspect is selected
                bestAspect = -1;
            }
        }
        sentence.setTopic(bestAspect);
        stnIdx++;
    }
}
Also used : structures._Stn(structures._Stn)

Example 2 with structures._Stn

use of structures._Stn in project IR_Base by Linda-sunshine.

the class HTSMAnalyzer method analyzeSectionWithStnSplit.

/**
 * Splits one review section into sentences and records every non-empty one.
 *
 * Each sentence produced by {@code m_stnDetector.sentDetect} is tokenized,
 * turned into a sparse feature map via {@code constructSpVct}, and, when that
 * map is non-empty, POS-tagged and appended to {@code stnList}; its feature
 * map is merged into {@code docVct} and appended to {@code spVcts}.
 *
 * @param content raw text of the section
 * @param y       label forwarded to {@code constructSpVct}
 * @param sLabel  sentence label stored in each created {@code _Stn}
 * @param docVct  document-level feature map, updated in place
 * @param spVcts  receives one feature map per kept sentence
 * @param stnList receives one {@code _Stn} per kept sentence
 * @return number of sentences that were kept
 */
int analyzeSectionWithStnSplit(String content, int y, int sLabel, HashMap<Integer, Double> docVct, ArrayList<HashMap<Integer, Double>> spVcts, ArrayList<_Stn> stnList) {
    // NOTE(review): the result of this whole-section pass is never read below;
    // the call is kept in case the helper has side effects -- confirm and drop if not.
    TokenizerNormalizeStemmer(content);

    int keptSentences = 0;
    for (String sentence : m_stnDetector.sentDetect(content)) {
        TokenizeResult tokenized = TokenizerNormalizeStemmer(sentence);
        HashMap<Integer, Double> stnVct = constructSpVct(tokenized.getTokens(), y, docVct);
        if (stnVct.isEmpty()) {
            // avoid empty sentence
            continue;
        }

        // POS tagging has to be on the raw tokens
        String[] posTags = m_tagger.tag(tokenized.getRawTokens());
        _Stn stn = new _Stn(Utils.createSpVct(stnVct), tokenized.getRawTokens(), posTags, sentence, sLabel);
        stnList.add(stn);
        keptSentences++;

        Utils.mergeVectors(stnVct, docVct);
        spVcts.add(stnVct);
    }
    return keptSentences;
}
Also used : TokenizeResult(structures.TokenizeResult) structures._Stn(structures._Stn)

Example 3 with structures._Stn

use of structures._Stn in project IR_Base by Linda-sunshine.

the class HTSMAnalyzer method AnalyzeNewEggPostWithSentence.

/**
 * Builds a sentence-level document from a NewEgg post and adds it to the corpus.
 *
 * Depending on {@code m_prosConsLoad}, the pro, con and comment sections are
 * each split into sentences via {@code analyzeSectionWithStnSplit} (sentence
 * labels 0, 1 and -1 respectively). The post is accepted only if the
 * collected document vector and sentence list reach {@code m_lengthThreshold}
 * and {@code m_stnSizeThreshold}.
 *
 * @param post the post to analyze
 * @return {@code true} if a document was created and added to the corpus,
 *         {@code false} if the post was too short
 * @throws ParseException if the post's date cannot be parsed by {@code m_dateFormatter}
 */
protected boolean AnalyzeNewEggPostWithSentence(_NewEggPost post) throws ParseException {
    // to avoid empty sentences
    ArrayList<_Stn> stnList = new ArrayList<_Stn>();
    // Collect the index and counts of features.
    ArrayList<HashMap<Integer, Double>> spVcts = new ArrayList<HashMap<Integer, Double>>();
    // raw text is only accumulated when the content is not released
    StringBuffer buffer = m_releaseContent ? null : new StringBuffer(256);
    // docVct is used to collect DF statistics
    HashMap<Integer, Double> docVct = new HashMap<Integer, Double>();
    int y = post.getLabel() - 1;
    int prosSentenceCounter = 0;
    int consSentenceCounter = 0;
    String content;

    // sentences in pro section
    boolean loadPros = m_prosConsLoad == LoadType.LT_pros
            || m_prosConsLoad == LoadType.LT_procon
            || m_prosConsLoad == LoadType.LT_all;
    if (loadPros) {
        content = post.getProContent();
        if (content != null) {
            prosSentenceCounter = analyzeSectionWithStnSplit(content, y, 0, docVct, spVcts, stnList);
            if (!m_releaseContent) {
                buffer.append(String.format("Pros: %s\n", content));
            }
        }
    }

    // sentences in con section
    boolean loadCons = m_prosConsLoad == LoadType.LT_cons
            || m_prosConsLoad == LoadType.LT_procon
            || m_prosConsLoad == LoadType.LT_all;
    if (loadCons) {
        content = post.getConContent();
        if (content != null) {
            consSentenceCounter = analyzeSectionWithStnSplit(content, y, 1, docVct, spVcts, stnList);
            if (!m_releaseContent) {
                buffer.append(String.format("Cons: %s\n", content));
            }
        }
    }

    // sentences in comments; their count is not tracked separately
    boolean loadComments = m_prosConsLoad == LoadType.LT_comments
            || m_prosConsLoad == LoadType.LT_all;
    if (loadComments) {
        content = post.getComments();
        if (content != null) {
            analyzeSectionWithStnSplit(content, y, -1, docVct, spVcts, stnList);
            if (!m_releaseContent) {
                buffer.append(String.format("Comments: %s\n", content));
            }
        }
    }

    // reject posts with too few features or too few sentences
    if (docVct.size() < m_lengthThreshold || stnList.size() < m_stnSizeThreshold) {
        return false;
    }

    long timeStamp = m_dateFormatter.parse(post.getDate()).getTime();
    // int ID, String name, String prodID, String title, String source, int ylabel, long timeStamp
    _Doc doc = new _Doc(m_corpus.getSize(), post.getID(), post.getProdId(), post.getTitle(), (m_releaseContent ? null : buffer.toString()), y, timeStamp);
    // source = 2 means the Document is from newEgg
    doc.setSourceType(2);
    doc.createSpVct(spVcts);
    doc.setSentences(stnList);
    setStnFvs(doc);
    m_corpus.addDoc(doc);

    // bookkeeping for accepted documents only
    m_classMemberNo[y]++;
    m_prosStnCount += prosSentenceCounter;
    m_consStnCount += consSentenceCounter;
    return true;
}
Also used : structures._Stn(structures._Stn) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) structures._Doc(structures._Doc)

Example 4 with structures._Stn

use of structures._Stn in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC method initialize_probability.

/**
 * Resets the sufficient statistics and initializes topic assignments for
 * every document in the collection.
 *
 * After {@code createSpace()}, each topic row of {@code topic_term_probabilty}
 * is zeroed, each row of {@code word_topic_sstat} is filled with
 * {@code d_beta}, and {@code m_sstat} is filled with
 * {@code d_beta * vocabulary_size}. Parent documents get a topic vector per
 * sentence and then {@code setTopics4Gibbs}; child documents get
 * {@code setTopics4Gibbs_LDA}. Every word's assigned topic is then counted
 * into the word-topic and per-topic statistics, and {@code imposePrior()} is
 * called last.
 *
 * @param collection documents to initialize
 */
protected void initialize_probability(Collection<_Doc> collection) {
    createSpace();

    for (int topic = 0; topic < number_of_topics; topic++) {
        Arrays.fill(topic_term_probabilty[topic], 0);
        Arrays.fill(word_topic_sstat[topic], d_beta);
    }
    Arrays.fill(m_sstat, d_beta * vocabulary_size);

    for (_Doc doc : collection) {
        if (doc instanceof _ParentDoc) {
            // sentence-level topic vectors are allocated before the document is initialized
            for (_Stn sentence : doc.getSentences()) {
                sentence.setTopicsVct(number_of_topics);
            }
            doc.setTopics4Gibbs(number_of_topics, d_alpha);
        } else if (doc instanceof _ChildDoc) {
            _ChildDoc child = (_ChildDoc) doc;
            child.setTopics4Gibbs_LDA(number_of_topics, d_alpha);
        }

        // accumulate counts from the topic assigned to each word
        for (_Word word : doc.getWords()) {
            int topicId = word.getTopic();
            int termId = word.getIndex();
            word_topic_sstat[topicId][termId]++;
            m_sstat[topicId]++;
        }
    }

    imposePrior();
}
Also used : structures._Stn(structures._Stn) structures._ChildDoc(structures._ChildDoc) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) structures._Word(structures._Word)

Example 5 with structures._Stn

use of structures._Stn in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method discoverSpecificComments.

/**
 * Writes topic similarities between each parent document and its child
 * documents to a tab-separated file.
 *
 * One line is written per {@code _ParentDoc} in {@code m_trainSet}: the
 * parent's name, then for every child document a field of the form
 * {@code childName:docSim:stnIdx:stnSim:...}, where {@code docSim} is
 * {@code computeSimilarity} of the parent and child topic vectors and each
 * {@code stnIdx:stnSim} pair is the 1-based sentence index (sentence index
 * plus one) and the similarity of that sentence's topics to the child's.
 *
 * The writer is closed in a {@code finally} block so the file handle is
 * released even if an exception is thrown while writing.
 *
 * @param similarityFile path of the output file; if it cannot be opened the
 *                       stack trace is printed and nothing is written
 */
protected void discoverSpecificComments(String similarityFile) {
    System.out.println("topic similarity");
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(similarityFile));
        for (_Doc doc : m_trainSet) {
            if (!(doc instanceof _ParentDoc)) {
                continue;
            }
            _ParentDoc parent = (_ParentDoc) doc;
            pw.print(doc.getName() + "\t");
            for (_ChildDoc cDoc : parent.m_childDocs) {
                pw.print(cDoc.getName() + ":");
                double docTopicSimilarity = computeSimilarity(parent.m_topics, cDoc.m_topics);
                pw.print(docTopicSimilarity);
                for (_Stn stnObj : doc.getSentences()) {
                    double stnTopicSimilarity = computeSimilarity(stnObj.m_topics, cDoc.m_topics);
                    // sentence indices are reported 1-based
                    pw.print(":" + (stnObj.getIndex() + 1) + ":" + stnTopicSimilarity);
                }
                pw.print("\t");
            }
            pw.println();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        // release the file handle on every exit path, not just the normal one
        if (pw != null) {
            pw.close();
        }
    }
}
Also used : structures._ChildDoc(structures._ChildDoc) structures._Stn(structures._Stn) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) FileNotFoundException(java.io.FileNotFoundException) File(java.io.File) PrintWriter(java.io.PrintWriter)

Aggregations

structures._Stn (structures._Stn) 46, structures._ChildDoc (structures._ChildDoc) 33, structures._ParentDoc (structures._ParentDoc) 27, structures._Doc (structures._Doc) 22, HashMap (java.util.HashMap) 19, File (java.io.File) 17, PrintWriter (java.io.PrintWriter) 17, structures._Word (structures._Word) 16, FileNotFoundException (java.io.FileNotFoundException) 15, structures._SparseFeature (structures._SparseFeature) 12, structures._ParentDoc4DCM (structures._ParentDoc4DCM) 6, Map (java.util.Map) 5, structures._ChildDoc4BaseWithPhi (structures._ChildDoc4BaseWithPhi) 4, ArrayList (java.util.ArrayList) 3, IOException (java.io.IOException) 2, ParseException (java.text.ParseException) 2, TokenizeResult (structures.TokenizeResult) 2, TreeMap (java.util.TreeMap) 1, MyPriorityQueue (structures.MyPriorityQueue) 1, structures._ChildDoc4BaseWithPhi_Hard (structures._ChildDoc4BaseWithPhi_Hard) 1