Search in sources :

Example 1 with structures._NewEggPost

use of structures._NewEggPost in project IR_Base by Linda-sunshine.

the class HTSMAnalyzer method AnalyzeNewEggPostWithSentence.

/**
 * Analyzes one NewEgg review sentence by sentence and, if it is large enough,
 * stores it in the corpus as a {@code _Doc} carrying its sentences.
 *
 * <p>Which of the pros / cons / comments sections are processed is decided by
 * {@code m_prosConsLoad}; a section whose getter returns {@code null} is skipped.
 * Every processed section goes through {@code analyzeSectionWithStnSplit}. Unless
 * {@code m_releaseContent} is set, the raw text of each processed section is
 * also kept and handed to the document as its source string.
 *
 * @param post the review to analyze
 * @return {@code true} if a document was added to the corpus, {@code false} if
 *         the review did not reach {@code m_lengthThreshold} distinct features
 *         or {@code m_stnSizeThreshold} sentences
 * @throws ParseException if the post's date cannot be parsed by {@code m_dateFormatter}
 */
protected boolean AnalyzeNewEggPostWithSentence(_NewEggPost post) throws ParseException {
    // Raw text is only accumulated when the content is not released.
    StringBuffer rawText = m_releaseContent ? null : new StringBuffer(256);
    // Document-level feature map, used to collect DF statistics.
    HashMap<Integer, Double> docVct = new HashMap<Integer, Double>();
    // One sparse vector (feature index -> count) per analyzed unit.
    ArrayList<HashMap<Integer, Double>> spVcts = new ArrayList<HashMap<Integer, Double>>();
    // Sentences kept by the splitter (empty sentences are avoided).
    ArrayList<_Stn> stnList = new ArrayList<_Stn>();

    int label = post.getLabel() - 1;
    int prosStns = 0;
    int consStns = 0;

    // Pros section; passed to the splitter with section argument 0.
    boolean wantPros = m_prosConsLoad == LoadType.LT_pros
            || m_prosConsLoad == LoadType.LT_procon
            || m_prosConsLoad == LoadType.LT_all;
    if (wantPros) {
        String pros = post.getProContent();
        if (pros != null) {
            prosStns = analyzeSectionWithStnSplit(pros, label, 0, docVct, spVcts, stnList);
            if (rawText != null) {
                rawText.append(String.format("Pros: %s\n", pros));
            }
        }
    }

    // Cons section; passed to the splitter with section argument 1.
    boolean wantCons = m_prosConsLoad == LoadType.LT_cons
            || m_prosConsLoad == LoadType.LT_procon
            || m_prosConsLoad == LoadType.LT_all;
    if (wantCons) {
        String cons = post.getConContent();
        if (cons != null) {
            consStns = analyzeSectionWithStnSplit(cons, label, 1, docVct, spVcts, stnList);
            if (rawText != null) {
                rawText.append(String.format("Cons: %s\n", cons));
            }
        }
    }

    // Comments section; section argument -1, sentence count is not tracked.
    boolean wantComments = m_prosConsLoad == LoadType.LT_comments
            || m_prosConsLoad == LoadType.LT_all;
    if (wantComments) {
        String comments = post.getComments();
        if (comments != null) {
            analyzeSectionWithStnSplit(comments, label, -1, docVct, spVcts, stnList);
            if (rawText != null) {
                rawText.append(String.format("Comments: %s\n", comments));
            }
        }
    }

    // Reject reviews with too few distinct features or too few sentences.
    if (!(docVct.size() >= m_lengthThreshold && stnList.size() >= m_stnSizeThreshold)) {
        return false;
    }

    long timeStamp = m_dateFormatter.parse(post.getDate()).getTime();
    // _Doc arguments: ID, name, prodID, title, source, ylabel, timeStamp
    _Doc doc = new _Doc(
            m_corpus.getSize(),
            post.getID(),
            post.getProdId(),
            post.getTitle(),
            (rawText == null ? null : rawText.toString()),
            label,
            timeStamp);
    // Source type 2 marks the document as coming from NewEgg.
    doc.setSourceType(2);
    doc.createSpVct(spVcts);
    doc.setSentences(stnList);
    setStnFvs(doc);
    m_corpus.addDoc(doc);

    // Bookkeeping is updated only for documents that were actually added.
    m_classMemberNo[label]++;
    m_prosStnCount += prosStns;
    m_consStnCount += consStns;
    return true;
}
Also used : structures._Stn(structures._Stn) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) structures._Doc(structures._Doc)

Example 2 with structures._NewEggPost

use of structures._NewEggPost in project IR_Base by Linda-sunshine.

the class HTSMAnalyzer method AnalyzeNewEggPost.

/**
 * Analyzes one NewEgg review without sentence splitting and, if it is large
 * enough, stores it in the corpus as a {@code _Doc}.
 *
 * <p>Which of the pros / cons / comments sections are processed is decided by
 * {@code m_prosConsLoad}; a section whose getter returns {@code null} is skipped.
 * Every processed section goes through {@code analyzeSection}. Unless
 * {@code m_releaseContent} is set, the raw text of each processed section is
 * also kept and handed to the document as its source string.
 *
 * @param post the review to analyze
 * @return {@code true} if a document was added to the corpus, {@code false} if
 *         the review has fewer than {@code m_lengthThreshold} distinct features
 * @throws ParseException if the post's date cannot be parsed by {@code m_dateFormatter}
 */
protected boolean AnalyzeNewEggPost(_NewEggPost post) throws ParseException {
    // Raw text is only accumulated when the content is not released.
    StringBuffer rawText = m_releaseContent ? null : new StringBuffer(256);
    // Document-level feature map, used to collect DF.
    HashMap<Integer, Double> docVct = new HashMap<Integer, Double>();
    // Sparse vectors holding feature indices and counts.
    ArrayList<HashMap<Integer, Double>> spVcts = new ArrayList<HashMap<Integer, Double>>();
    int label = post.getLabel() - 1;

    // Pros section.
    boolean wantPros = m_prosConsLoad == LoadType.LT_all
            || m_prosConsLoad == LoadType.LT_procon
            || m_prosConsLoad == LoadType.LT_pros;
    if (wantPros) {
        String pros = post.getProContent();
        if (pros != null) {
            analyzeSection(pros, label, docVct, spVcts);
            if (rawText != null) {
                rawText.append(String.format("Pros: %s\n", pros));
            }
        }
    }

    // Cons section.
    boolean wantCons = m_prosConsLoad == LoadType.LT_all
            || m_prosConsLoad == LoadType.LT_procon
            || m_prosConsLoad == LoadType.LT_cons;
    if (wantCons) {
        String cons = post.getConContent();
        if (cons != null) {
            analyzeSection(cons, label, docVct, spVcts);
            if (rawText != null) {
                rawText.append(String.format("Cons: %s\n", cons));
            }
        }
    }

    // Comments section.
    boolean wantComments = m_prosConsLoad == LoadType.LT_all
            || m_prosConsLoad == LoadType.LT_comments;
    if (wantComments) {
        String comments = post.getComments();
        if (comments != null) {
            analyzeSection(comments, label, docVct, spVcts);
            if (rawText != null) {
                rawText.append(String.format("Comments: %s\n", comments));
            }
        }
    }

    // Reject reviews with too few distinct features.
    if (!(docVct.size() >= m_lengthThreshold)) {
        return false;
    }

    long timeStamp = m_dateFormatter.parse(post.getDate()).getTime();
    // _Doc arguments: ID, name, prodID, title, source, ylabel, timeStamp
    _Doc doc = new _Doc(
            m_corpus.getSize(),
            post.getID(),
            post.getProdId(),
            post.getTitle(),
            (rawText == null ? null : rawText.toString()),
            label,
            timeStamp);
    // Source type 2 means the document is from NewEgg.
    doc.setSourceType(2);
    doc.createSpVct(spVcts);
    m_corpus.addDoc(doc);
    m_classMemberNo[label]++;
    return true;
}
Also used : HashMap(java.util.HashMap) structures._Doc(structures._Doc) ArrayList(java.util.ArrayList)

Example 3 with structures._NewEggPost

use of structures._NewEggPost in project IR_Base by Linda-sunshine.

the class HTSMAnalyzer method LoadNewEggDoc.

/**
 * Loads a NewEgg JSON file and analyzes every review found under the
 * {@code m_category} entry.
 *
 * <p>The category object maps item ids to arrays of reviews. Each review is
 * wrapped in a {@code _NewEggPost} and passed to
 * {@code AnalyzeNewEggPostWithSentence} when a sentence detector is configured
 * ({@code m_stnDetector != null}), otherwise to {@code AnalyzeNewEggPost}.
 *
 * <p>Error handling is best-effort: if the file or the category cannot be
 * loaded, an {@code 'X'} is printed and the method returns. A failure on one
 * item or one review is reported and skipped, so the remaining reviews are
 * still processed instead of being dropped along with the bad one.
 *
 * @param filename path of the JSON file to load
 */
public void LoadNewEggDoc(String filename) {
    JSONObject prod;
    JSONArray itemIds;
    try {
        JSONObject json = LoadJSON(filename);
        prod = json.getJSONObject(m_category);
        itemIds = prod.names();
        System.out.printf("Under %s category, Number of Items: %d\n", m_category, itemIds.length());
    } catch (Exception e) {
        // Unreadable file or missing category: mark it and give up on this file.
        System.out.print('X');
        return;
    }

    for (int i = 0; i < itemIds.length(); i++) {
        String item;
        JSONArray reviews;
        try {
            item = itemIds.getString(i);
            reviews = prod.getJSONArray(item);
        } catch (JSONException e) {
            // This item's review array is unusable; move on to the next item.
            System.out.print('P');
            e.printStackTrace();
            continue;
        }

        for (int j = 0; j < reviews.length(); j++) {
            // Errors are caught per review so that one malformed entry does not
            // discard the remaining reviews of the same item.
            try {
                _NewEggPost post = new _NewEggPost(reviews.getJSONObject(j), item);
                if (this.m_stnDetector != null) {
                    AnalyzeNewEggPostWithSentence(post);
                } else {
                    AnalyzeNewEggPost(post);
                }
            } catch (JSONException e) {
                System.out.print('P');
                e.printStackTrace();
            } catch (ParseException e) {
                e.printStackTrace();
            }
        }
    }
}
Also used : JSONObject(json.JSONObject) JSONArray(json.JSONArray) structures._NewEggPost(structures._NewEggPost) JSONException(json.JSONException) ParseException(java.text.ParseException) IOException(java.io.IOException) InvalidFormatException(opennlp.tools.util.InvalidFormatException) FileNotFoundException(java.io.FileNotFoundException)

Aggregations

ArrayList (java.util.ArrayList): 2, HashMap (java.util.HashMap): 2, structures._Doc (structures._Doc): 2, FileNotFoundException (java.io.FileNotFoundException): 1, IOException (java.io.IOException): 1, ParseException (java.text.ParseException): 1, JSONArray (json.JSONArray): 1, JSONException (json.JSONException): 1, JSONObject (json.JSONObject): 1, InvalidFormatException (opennlp.tools.util.InvalidFormatException): 1, structures._NewEggPost (structures._NewEggPost): 1, structures._Stn (structures._Stn): 1