Use of structures._Stn in project IR_Base by Linda-sunshine.
Class outputFile, method outputFiles.
/**
 * Exports the parent/child documents of a corpus into several text files under {@code filePrefix}.
 *
 * <p>Parents are the {@code _ParentDoc} instances of {@code c.getCollection()}, processed in
 * ascending order of the integer value of their name. Files written:
 * <ul>
 * <li>{@code words.AT.txt}: one feature name per line (only when the feature size is non-zero);</li>
 * <li>{@code selected_StnLength.txt}: the length of every non-null parent sentence, one per line;</li>
 * <li>{@code selected_Stn.txt}: per parent, its id, the number of non-null sentences and each
 * sentence's index plus one, tab separated;</li>
 * <li>{@code selected_ShortStn.txt} / {@code selected_LongStn.txt}: per parent, its id followed by
 * the index plus one of the sentences whose length is below / not below 15;</li>
 * <li>{@code selected_Comments.txt}: per parent, its name followed by the names of its child
 * documents ordered by numeric child id;</li>
 * <li>{@code abagf.AT.txt} / {@code cbagf.AT.txt}: parent sentences / child documents, one per
 * line, as a length followed by every feature index repeated according to its value.</li>
 * </ul>
 * Summary statistics are printed to standard output.
 *
 * <p>The export is best-effort: any exception is printed to standard error and not rethrown.
 * Every writer that was opened is closed (and thereby flushed) even when an exception occurs.
 * When no child document exists, the longest/shortest child length is reported as {@code NaN}
 * instead of aborting the export.
 *
 * @param filePrefix directory (without trailing slash) that receives the output files
 * @param c corpus whose documents and feature names are exported
 */
public static void outputFiles(String filePrefix, _Corpus c) {
    // Sentences shorter than this are listed in the "short" file, all others in the "long" file.
    final int shortStnThreshold = 15;

    String selectedSentencesinParentFile = filePrefix + "/selected_Stn.txt";
    String selectedCommentsFile = filePrefix + "/selected_Comments.txt";
    String sctmFormatParentFile = filePrefix + "/abagf.AT.txt";
    String sctmFormatChildFile = filePrefix + "/cbagf.AT.txt";
    String sctmWordFile = filePrefix + "/words.AT.txt";
    String stnLengthFile = filePrefix + "/selected_StnLength.txt";
    String shortStnFile = filePrefix + "/selected_ShortStn.txt";
    String longStnFile = filePrefix + "/selected_LongStn.txt";

    // Declared outside the try block so the finally clause can close whatever was opened.
    PrintWriter stnLengthPW = null;
    PrintWriter shortParentPW = null;
    PrintWriter longParentPW = null;
    PrintWriter parentPW = null;
    PrintWriter childPW = null;
    PrintWriter sctmParentPW = null;
    PrintWriter sctmChildPW = null;

    try {
        // Vocabulary file: one feature name per line.
        if (c.getFeatureSize() != 0) {
            PrintWriter wordPW = new PrintWriter(new File(sctmWordFile));
            try {
                for (int i = 0; i < c.getFeatureSize(); i++) {
                    String wordName = c.getFeature(i);
                    wordPW.println(wordName);
                }
            } finally {
                wordPW.close();
            }
        }

        stnLengthPW = new PrintWriter(new File(stnLengthFile));
        shortParentPW = new PrintWriter(new File(shortStnFile));
        longParentPW = new PrintWriter(new File(longStnFile));
        parentPW = new PrintWriter(new File(selectedSentencesinParentFile));
        childPW = new PrintWriter(new File(selectedCommentsFile));
        sctmParentPW = new PrintWriter(new File(sctmFormatParentFile));
        sctmChildPW = new PrintWriter(new File(sctmFormatChildFile));

        // Index the parent documents by the integer value of their name (sorted by the TreeMap).
        int totalParentNum = 0;
        TreeMap<Integer, _ParentDoc> parentMap = new TreeMap<Integer, _ParentDoc>();
        for (_Doc d : c.getCollection()) {
            if (d instanceof _ParentDoc) {
                totalParentNum += 1;
                parentMap.put(Integer.parseInt(d.getName()), (_ParentDoc) d);
            }
        }

        // First pass: length statistics and the per-parent sentence selection files.
        int totalStnNum = 0;
        ArrayList<Double> parentDocLenList = new ArrayList<Double>();
        ArrayList<Double> childDocLenList = new ArrayList<Double>();
        double parentDocLenSum = 0;
        double childDocLenSum = 0;
        for (java.util.Map.Entry<Integer, _ParentDoc> entry : parentMap.entrySet()) {
            int parentID = entry.getKey();
            _ParentDoc parentObj = entry.getValue();

            double parentDocLen = parentObj.getTotalDocLength();
            parentDocLenSum += parentDocLen;
            parentDocLenList.add(parentDocLen);
            for (_ChildDoc cDoc : parentObj.m_childDocs) {
                double childDocLen = cDoc.getTotalDocLength();
                childDocLenList.add(childDocLen);
                childDocLenSum += childDocLen;
            }

            ArrayList<_Stn> selected = nonNullSentences(parentObj.getSentences());
            totalStnNum += selected.size();

            parentPW.print(parentID + "\t" + selected.size() + "\t");
            shortParentPW.print(parentID + "\t");
            longParentPW.print(parentID + "\t");
            for (_Stn stnObj : selected) {
                stnLengthPW.println(stnObj.getLength());
                if (stnObj.getLength() < shortStnThreshold)
                    shortParentPW.print((stnObj.getIndex() + 1) + "\t");
                else
                    longParentPW.print((stnObj.getIndex() + 1) + "\t");
                parentPW.print((stnObj.getIndex() + 1) + "\t");
            }
            parentPW.println();
            longParentPW.println();
            shortParentPW.println();
        }

        // Collections.max/min throw on an empty collection; report NaN instead so that the
        // remaining files are still produced (the averages below are NaN in that case as well).
        boolean hasChildren = !childDocLenList.isEmpty();
        System.out.println("longest child\t" + (hasChildren ? Collections.max(childDocLenList) : Double.NaN));
        System.out.println("shortest child\t" + (hasChildren ? Collections.min(childDocLenList) : Double.NaN));
        System.out.println("parent doc len\t" + parentDocLenSum / parentDocLenList.size());
        System.out.println("child doc len\t" + childDocLenSum / childDocLenList.size());

        // Second pass: parent sentences and child documents with expanded feature indices.
        sctmParentPW.println(totalParentNum);
        sctmChildPW.println(totalParentNum);
        System.out.println("stnNum" + totalStnNum);
        for (_ParentDoc d : parentMap.values()) {
            ArrayList<_Stn> selected = nonNullSentences(d.getSentences());
            sctmParentPW.println(selected.size());
            for (_Stn stnObj : selected) {
                _SparseFeature[] sv = stnObj.getFv();
                sctmParentPW.print((int) stnObj.getLength() + "\t");
                printExpandedFeatures(sctmParentPW, sv);
                sctmParentPW.println();
            }

            ArrayList<_ChildDoc> childDocs = d.m_childDocs;
            sctmChildPW.println(childDocs.size());

            // Order the children by the numeric id left after removing "<parentName>_" from the name.
            String parentName = d.getName();
            TreeMap<Integer, _ChildDoc> childMap = new TreeMap<Integer, _ChildDoc>();
            for (_ChildDoc cDoc : childDocs) {
                String childName = cDoc.getName();
                int childID = Integer.parseInt(childName.replace(parentName + "_", ""));
                childMap.put(childID, cDoc);
            }

            childPW.print(parentName + "\t");
            for (_ChildDoc cDoc : childMap.values()) {
                sctmChildPW.print((int) cDoc.getTotalDocLength() + "\t");
                childPW.print(cDoc.getName() + "\t");
                printExpandedFeatures(sctmChildPW, cDoc.getSparse());
                sctmChildPW.println();
            }
            childPW.println();
        }
    } catch (Exception e) {
        // Best-effort export: report the problem but do not propagate it to the caller.
        e.printStackTrace();
    } finally {
        closeWriters(parentPW, stnLengthPW, shortParentPW, longParentPW, sctmParentPW, sctmChildPW, childPW);
    }
}

/** Returns the non-null entries of {@code sentences}, in their original order. */
private static ArrayList<_Stn> nonNullSentences(_Stn[] sentences) {
    ArrayList<_Stn> selected = new ArrayList<_Stn>();
    for (_Stn stnObj : sentences) {
        if (stnObj != null)
            selected.add(stnObj);
    }
    return selected;
}

/** Prints each feature index followed by a tab, repeated while the repeat counter is below the feature value. */
private static void printExpandedFeatures(PrintWriter pw, _SparseFeature[] features) {
    for (_SparseFeature feature : features) {
        int index = feature.getIndex();
        double value = feature.getValue();
        for (int v = 0; v < value; v++)
            pw.print(index + "\t");
    }
}

/** Closes (and thereby flushes) every writer that is not {@code null}. */
private static void closeWriters(PrintWriter... writers) {
    for (PrintWriter writer : writers) {
        if (writer != null)
            writer.close();
    }
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class ACCTM, method initialize_probability.
/**
 * Resets the sufficient statistics and initializes the topic assignments of every document.
 *
 * <p>After {@code createSpace()}, every row of {@code word_topic_sstat} is filled with
 * {@code d_beta} and {@code m_sstat} with {@code d_beta * vocabulary_size}. Parent documents
 * get their topics set up together with a topic vector per sentence; child documents are set up
 * through {@code setTopics4Gibbs_LDA} followed by {@code computeMu4Doc}. The topic of every word
 * of every document is then counted into both statistics, {@code imposePrior()} is called and
 * the statistics are flagged as not normalized.
 *
 * @param collection documents to initialize
 */
protected void initialize_probability(Collection<_Doc> collection) {
    createSpace();

    // Start the counts from the prior so it does not have to be added later on.
    for (int k = 0; k < number_of_topics; k++) {
        Arrays.fill(word_topic_sstat[k], d_beta);
    }
    Arrays.fill(m_sstat, d_beta * vocabulary_size);

    for (_Doc doc : collection) {
        if (doc instanceof _ParentDoc) {
            doc.setTopics4Gibbs(number_of_topics, 0);
            for (_Stn sentence : doc.getSentences()) {
                sentence.setTopicsVct(number_of_topics);
            }
        } else if (doc instanceof _ChildDoc) {
            _ChildDoc child = (_ChildDoc) doc;
            child.setTopics4Gibbs_LDA(number_of_topics, 0);
            computeMu4Doc(child);
        }

        // Count the topic assignment of every word, whatever the document type.
        for (_Word word : doc.getWords()) {
            int topic = word.getTopic();
            word_topic_sstat[topic][word.getIndex()]++;
            m_sstat[topic]++;
        }
    }

    imposePrior();
    m_statisticsNormalized = false;
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class ACCTM_C, method initialize_probability.
/**
 * Resets the sufficient statistics and initializes the topic assignments of every document.
 *
 * <p>After {@code createSpace()}, every row of {@code word_topic_sstat} is filled with
 * {@code d_beta} and {@code m_sstat} with {@code d_beta * vocabulary_size}. For a parent
 * document, topics and per-sentence topic vectors are set up and every word is counted into the
 * statistics. For a {@code _ChildDoc4BaseWithPhi}, the X space and topics are set up,
 * {@code computeMu4Doc} is called, and only words whose {@code getX()} is 0 are counted.
 * Documents of any other type are left untouched. Finally {@code imposePrior()} is called and
 * the statistics are flagged as not normalized.
 *
 * @param collection documents to initialize
 */
protected void initialize_probability(Collection<_Doc> collection) {
    createSpace();

    for (int k = 0; k < number_of_topics; k++) {
        Arrays.fill(word_topic_sstat[k], d_beta);
    }
    Arrays.fill(m_sstat, d_beta * vocabulary_size);

    for (_Doc doc : collection) {
        if (doc instanceof _ParentDoc) {
            doc.setTopics4Gibbs(number_of_topics, 0);
            for (_Stn sentence : doc.getSentences()) {
                sentence.setTopicsVct(number_of_topics);
            }

            for (_Word word : doc.getWords()) {
                int topic = word.getTopic();
                word_topic_sstat[topic][word.getIndex()]++;
                m_sstat[topic]++;
            }
        } else if (doc instanceof _ChildDoc4BaseWithPhi) {
            _ChildDoc4BaseWithPhi phiChild = (_ChildDoc4BaseWithPhi) doc;
            phiChild.createXSpace(number_of_topics, m_gamma.length, vocabulary_size, d_beta);
            phiChild.setTopics4Gibbs(number_of_topics, 0);
            computeMu4Doc((_ChildDoc) doc);

            for (_Word word : doc.getWords()) {
                // Only words with x == 0 contribute to the global statistics.
                if (word.getX() != 0) {
                    continue;
                }
                int topic = word.getTopic();
                int wordIndex = word.getIndex();
                word_topic_sstat[topic][wordIndex]++;
                m_sstat[topic]++;
            }
        }
    }

    imposePrior();
    m_statisticsNormalized = false;
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class ACCTM_CZLR, method initTest4Spam.
/**
 * Prepares a parent document and all of its child documents for testing and appends them to
 * {@code sampleTestSet} (parent first, then the children in {@code m_childDocs} order).
 *
 * <p>The parent gets its topics and a topic vector per sentence. Every child is cast to
 * {@code _ChildDoc4BaseWithPhi}, gets its X space and topics, is linked to the parent and passed
 * to {@code computeMu4Doc}. Finally {@code setFeatures4Word} is invoked on the whole set.
 *
 * @param sampleTestSet list that receives the prepared documents
 * @param d document to prepare; it is cast to {@code _ParentDoc}
 */
public void initTest4Spam(ArrayList<_Doc> sampleTestSet, _Doc d) {
    _ParentDoc parent = (_ParentDoc) d;

    parent.setTopics4Gibbs(number_of_topics, 0);
    for (_Stn sentence : parent.getSentences()) {
        sentence.setTopicsVct(number_of_topics);
    }
    sampleTestSet.add(parent);

    for (_ChildDoc cDoc : parent.m_childDocs) {
        _ChildDoc4BaseWithPhi phiChild = (_ChildDoc4BaseWithPhi) cDoc;
        phiChild.createXSpace(number_of_topics, m_gamma.length, vocabulary_size, d_beta);
        phiChild.setTopics4Gibbs(number_of_topics, 0);

        sampleTestSet.add(cDoc);
        cDoc.setParentDoc(parent);
        computeMu4Doc(cDoc);
    }

    setFeatures4Word(sampleTestSet);
}
Use of structures._Stn in project IR_Base by Linda-sunshine.
Class ACCTM_C_test, method rankStn4ChildBySim.
/**
 * Scores every sentence of a parent document against a child document.
 *
 * <p>The score of a sentence is the negated result of
 * {@code Utils.klDivergence(cDoc.m_xTopics[0], sentence.m_topics)}.
 *
 * @param pDoc parent document whose sentences are scored
 * @param cDoc child document the sentences are compared with
 * @return map from sentence index plus one to the negated divergence
 */
protected HashMap<Integer, Double> rankStn4ChildBySim(_ParentDoc pDoc, _ChildDoc cDoc) {
    HashMap<Integer, Double> scoreByStn = new HashMap<Integer, Double>();

    for (_Stn sentence : pDoc.getSentences()) {
        double divergence = Utils.klDivergence(cDoc.m_xTopics[0], sentence.m_topics);
        int stnKey = sentence.getIndex() + 1;
        // Negated so that a smaller divergence yields a larger score.
        scoreByStn.put(stnKey, -divergence);
    }

    return scoreByStn;
}
Aggregations