Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class BurstinessMain, method main:
public static void main(String[] args) throws IOException, ParseException {
int mb = 1024 * 1024;
Runtime rTime = Runtime.getRuntime();
System.out.println("totalMem\t:" + rTime.totalMemory() / mb);
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value, which can also be "TFIDF" or "BM25".
String featureValue = "TF";
// The way of normalization (only 1 and 2).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// ACCTM, ACCTM_TwoTheta, ACCTM_C, ACCTM_CZ, ACCTM_CZLR, LDAonArticles, ACCTM_C,
// correspondence_LDA_Gibbs, LDA_Gibbs_Debug, LDA_Variational_multithread
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM,
String topicmodel = "Burstiness";
String category = "tablet";
int number_of_topics = 30;
// false means no reviews from NewEgg are used in training
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = true;
// 0: load nothing as prior; 1: load both sentiment and aspect; 2: load aspect only
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// A negative converge value means the likelihood convergence check is skipped.
double converge = 1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 1;
int gibbs_iteration = 500, gibbs_lag = 50;
int displayLap = 50;
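// Override the Gibbs sampling settings with small values, presumably for a quick debugging run.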
gibbs_iteration = 4;
gibbs_lag = 2;
displayLap = 2;
double burnIn = 0.4;
boolean sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
// articleType = "Gadgets";
// articleType = "Yahoo";
// articleType = "APP";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, articleType, topicmodel);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd-HHmm");
String filePrefix = String.format("./data/results/%s", dateFormatter.format(new Date()));
filePrefix = filePrefix + "-" + topicmodel + "-" + articleType;
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String outputFile = filePrefix + "/consoleOutput.txt";
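// Redirect all subsequent console output into consoleOutput.txt under the result folder.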
PrintStream printStream = new PrintStream(new FileOutputStream(outputFile));
System.setOut(printStream);
String infoFilePath = filePrefix + "/Information.txt";
// Store the top-k word distribution over topics.
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.;
// Used in feature selection, the ending point of the features.
double endProb = 0.999;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 5;
System.out.println("Performing feature selection, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
/**
*** parent child topic model ****
*/
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadDirectory(commentFolder, suffix);
if (topicmodel.equals("LDA_APPMerged"))
articleFolder = String.format("./data/ParentChildTopicModel/%sDescriptionsReviews", articleType);
// articleFolder = String.format(
// "./data/ParentChildTopicModel/%sArticles4Merged",
// articleType);
//
// commentFolder = String.format(
// "./data/ParentChildTopicModel/%sComments4Merged",
// articleType);
//
analyzer.LoadParentDirectory(articleFolder, suffix);
// analyzer.LoadDirectory(articleFolder, suffix);
// analyzer.LoadDirectory(commentFolder, suffix);
analyzer.LoadChildDirectory(commentFolder, suffix);
// if (topicmodel.equals("LDA_APP") && !topicmodel.equals("LDA_APPMerged"))
// analyzer.LoadChildDirectory(commentFolder, suffix);
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// analyzer.LoadNewEggDirectory(newEggFolder, suffix); //Load all the documents as the data set.
// analyzer.LoadDirectory(amazonFolder, suffix);
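// Convert raw term counts into the chosen feature representation ("TF" here) with the selected normalization.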
analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
// _Corpus c = analyzer.getCorpus();
// analyzer.generateFakeCorpus(filePrefix);
// analyzer.analyzeBurstiness(filePrefix);
}
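For orientation, the _Corpus returned by analyzer.returnCorpus(fvStatFile) is typically inspected through the accessors used elsewhere on this page (getCollection, getFeatureSize, getFeature, getTotalDocLength). The following is a minimal sketch under that assumption; it is not part of BurstinessMain itself.

// Sketch: summarize the corpus returned by analyzer.returnCorpus(fvStatFile).
// Only accessors that appear in the snippets on this page are used.
public static void inspectCorpus(_Corpus c) {
    System.out.println("vocabulary size\t" + c.getFeatureSize());
    // Print the first few feature (word) names.
    for (int i = 0; i < Math.min(10, c.getFeatureSize()); i++)
        System.out.println(i + "\t" + c.getFeature(i));
    // Average document length over the whole collection.
    ArrayList<_Doc> docs = c.getCollection();
    double totalLen = 0;
    for (_Doc d : docs)
        totalLen += d.getTotalDocLength();
    System.out.println("average doc len\t" + totalLen / docs.size());
}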
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class languageModelBaseLine, method main:
public static void main(String[] args) throws IOException, ParseException {
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value, which can also be "TFIDF" or "BM25".
String featureValue = "BM25";
// The way of normalization (only 1 and 2).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM, ParentChild_Gibbs
String topicmodel = "languageModel";
String category = "tablet";
int number_of_topics = 20;
// false means no reviews from NewEgg are used in training
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = false;
// 0: load nothing as prior; 1: load both sentiment and aspect; 2: load aspect only
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// A negative converge value means the likelihood convergence check is skipped.
double converge = 1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 1;
int gibbs_iteration = 2000, gibbs_lag = 50;
gibbs_iteration = 4;
gibbs_lag = 2;
double burnIn = 0.4;
boolean display = true, sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
// articleType = "GadgetsArticles";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
// String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_topicmodel.txt", Ngram);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
Calendar today = Calendar.getInstance();
// Calendar.MONTH is zero-based, so add 1 (as done in the outputFile snippet below).
String filePrefix = String.format("./data/results/%s-%s-%s%s-%s", 1 + today.get(Calendar.MONTH), today.get(Calendar.DAY_OF_MONTH), today.get(Calendar.HOUR_OF_DAY), today.get(Calendar.MINUTE), topicmodel);
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String infoFilePath = filePrefix + "/Information.txt";
// Store the top-k word distribution over topics.
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.5;
// Used in feature selection, the ending point of the features.
double endProb = 0.999;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 30;
// System.out.println("Performing feature selection, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// analyzer.LoadStopwords(stopwords);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
/**
*** parent child topic model ****
*/
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
analyzer.LoadParentDirectory(articleFolder, suffix);
analyzer.LoadChildDirectory(commentFolder, suffix);
// analyzer.LoadNewEggDirectory(newEggFolder, suffix); //Load all the documents as the data set.
// analyzer.LoadDirectory(amazonFolder, suffix);
// analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
double mu = 800;
languageModelBaseLine lm = new languageModelBaseLine(c, mu);
lm.generateReferenceModel();
lm.printTopChild4Stn(filePrefix);
// bm25Corr.rankChild4Stn(c, TopChild4StnFile);
// bm25Corr.rankStn4Child(c, TopStn4ChildFile);
// bm25Corr.rankChild4Parent(c, TopChild4ParentFile);
// bm25Corr.discoverSpecificComments(c, similarityFile);
// String DFFile = filePrefix+"/df.txt";
// bm25Corr.outputDF(c, DFFile);
}
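The mu value passed to languageModelBaseLine is presumably a Dirichlet smoothing weight for the collection-level reference model built by generateReferenceModel. Below is a minimal sketch of Dirichlet smoothing under that assumption; the method is illustrative only and not an IR_Base API.

// Dirichlet-smoothed term probability: p(w|d) = (c(w,d) + mu * p(w|C)) / (|d| + mu)
public static double dirichletSmoothedProb(double termCountInDoc, double docLength,
        double collectionProb, double mu) {
    return (termCountInDoc + mu * collectionProb) / (docLength + mu);
}

With mu = 800, short documents are pulled strongly toward the collection model, while long documents rely mostly on their own counts.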
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class outputFile, method outputFiles:
public static void outputFiles(String filePrefix, _Corpus c) {
try {
String selectedSentencesinParentFile = filePrefix + "/selected_Stn.txt";
String selectedCommentsFile = filePrefix + "/selected_Comments.txt";
String sctmFormatParentFile = filePrefix + "/abagf.AT.txt";
String sctmFormatChildFile = filePrefix + "/cbagf.AT.txt";
String sctmWordFile = filePrefix + "/words.AT.txt";
String stnLengthFile = filePrefix + "/selected_StnLength.txt";
String shortStnFile = filePrefix + "/selected_ShortStn.txt";
String longStnFile = filePrefix + "/selected_LongStn.txt";
if (c.getFeatureSize() != 0) {
PrintWriter wordPW = new PrintWriter(new File(sctmWordFile));
for (int i = 0; i < c.getFeatureSize(); i++) {
String wordName = c.getFeature(i);
wordPW.println(wordName);
}
wordPW.flush();
wordPW.close();
}
PrintWriter stnLengthPW = new PrintWriter(new File(stnLengthFile));
PrintWriter shortParentPW = new PrintWriter(new File(shortStnFile));
PrintWriter longParentPW = new PrintWriter(new File(longStnFile));
PrintWriter parentPW = new PrintWriter(new File(selectedSentencesinParentFile));
PrintWriter childPW = new PrintWriter(new File(selectedCommentsFile));
PrintWriter sctmParentPW = new PrintWriter(new File(sctmFormatParentFile));
PrintWriter sctmChildPW = new PrintWriter(new File(sctmFormatChildFile));
int totoalParentNum = 0;
TreeMap<Integer, _ParentDoc> parentMap = new TreeMap<Integer, _ParentDoc>();
int totalStnNum = 0;
ArrayList<_Doc> m_trainSet = c.getCollection();
ArrayList<Integer> parentNameList = new ArrayList<Integer>();
for (_Doc d : m_trainSet) {
if (d instanceof _ParentDoc) {
// HashMap<Integer, _Stn> stnMap = ((_ParentDoc) d).m_sentenceMap;
totoalParentNum += 1;
String parentName = d.getName();
parentMap.put(Integer.parseInt(parentName), (_ParentDoc) d);
parentNameList.add(Integer.parseInt(parentName));
}
}
ArrayList<Double> parentDocLenList = new ArrayList<Double>();
ArrayList<Double> childDocLenList = new ArrayList<Double>();
double parentDocLenSum = 0;
double childDocLenSum = 0;
for (int parentID : parentMap.keySet()) {
_ParentDoc parentObj = parentMap.get(parentID);
double parentDocLen = parentObj.getTotalDocLength();
parentDocLenSum += parentDocLen;
parentDocLenList.add(parentDocLen);
for (_ChildDoc cDoc : parentObj.m_childDocs) {
double childDocLen = cDoc.getTotalDocLength();
childDocLenList.add(childDocLen);
childDocLenSum += childDocLen;
}
_Stn[] sentenceArray = parentObj.getSentences();
int selectedStn = 0;
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
selectedStn += 1;
stnLengthPW.println(stnObj.getLength());
// if(stnObj==null)
// continue;
// selectedStn += 1;
}
totalStnNum += selectedStn;
parentPW.print(parentID + "\t" + selectedStn + "\t");
shortParentPW.print(parentID + "\t");
longParentPW.print(parentID + "\t");
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
if (stnObj.getLength() < 15)
shortParentPW.print((stnObj.getIndex() + 1) + "\t");
else
longParentPW.print((stnObj.getIndex() + 1) + "\t");
parentPW.print((stnObj.getIndex() + 1) + "\t");
}
parentPW.println();
longParentPW.println();
shortParentPW.println();
}
System.out.println("longest child\t" + Collections.max(childDocLenList));
System.out.println("shortest child\t" + Collections.min(childDocLenList));
System.out.println("parent doc len\t" + parentDocLenSum / parentDocLenList.size());
System.out.println("child doc len\t" + childDocLenSum / childDocLenList.size());
parentPW.flush();
parentPW.close();
stnLengthPW.flush();
stnLengthPW.close();
shortParentPW.flush();
shortParentPW.close();
longParentPW.flush();
longParentPW.close();
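// SCTM format: the first line of each file is the number of parent documents; one block per parent follows.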
sctmParentPW.println(totoalParentNum);
sctmChildPW.println(totoalParentNum);
System.out.println("stnNum" + totalStnNum);
for (int parentID : parentMap.keySet()) {
_ParentDoc d = parentMap.get(parentID);
// HashMap<Integer, _Stn> stnMap = ((_ParentDoc) d).m_sentenceMap;
_Stn[] sentenceArray = (d).getSentences();
int selectedStn = 0;
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
selectedStn += 1;
}
sctmParentPW.println(selectedStn);
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
_SparseFeature[] sv = stnObj.getFv();
sctmParentPW.print((int) stnObj.getLength() + "\t");
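// Expand the sparse feature vector into a flat bag of words: each feature index is written once per occurrence.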
for (int j = 0; j < sv.length; j++) {
int index = sv[j].getIndex();
double value = sv[j].getValue();
for (int v = 0; v < value; v++) sctmParentPW.print(index + "\t");
}
sctmParentPW.println();
}
ArrayList<_ChildDoc> childDocs = ((_ParentDoc) d).m_childDocs;
sctmChildPW.println(childDocs.size());
String parentName = d.getName();
TreeMap<Integer, _ChildDoc> childMap = new TreeMap<Integer, _ChildDoc>();
for (_ChildDoc cDoc : childDocs) {
String childName = cDoc.getName();
int childID = Integer.parseInt(childName.replace(parentName + "_", ""));
childMap.put(childID, cDoc);
}
childPW.print(parentName + "\t");
for (int t : childMap.keySet()) {
_ChildDoc cDoc = childMap.get(t);
sctmChildPW.print((int) cDoc.getTotalDocLength() + "\t");
childPW.print(cDoc.getName() + "\t");
// System.out.println(cDoc.getName() + "\t");
_SparseFeature[] fv = cDoc.getSparse();
for (int j = 0; j < fv.length; j++) {
int index = fv[j].getIndex();
double value = fv[j].getValue();
for (int v = 0; v < value; v++) {
sctmChildPW.print(index + "\t");
}
}
sctmChildPW.println();
}
childPW.println();
}
sctmParentPW.flush();
sctmParentPW.close();
sctmChildPW.flush();
sctmChildPW.close();
childPW.flush();
childPW.close();
} catch (Exception e) {
e.printStackTrace();
}
}
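Based on the writing logic above, abagf.AT.txt begins with the number of parent documents, then for each parent one line with its sentence count followed by one line per sentence (the sentence length, then the expanded word indices). The reader below is a minimal sketch of that layout; SctmParentReader is a hypothetical helper, not part of IR_Base.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Scanner;

public class SctmParentReader {
    public static void main(String[] args) throws FileNotFoundException {
        Scanner in = new Scanner(new File("abagf.AT.txt"));
        int parentNum = Integer.parseInt(in.nextLine().trim()); // number of parent documents
        for (int p = 0; p < parentNum; p++) {
            int stnNum = Integer.parseInt(in.nextLine().trim()); // sentences kept for this parent
            for (int s = 0; s < stnNum; s++) {
                String[] tokens = in.nextLine().trim().split("\t");
                int stnLength = Integer.parseInt(tokens[0]); // first field: sentence length
                // tokens[1..] are word indices, repeated once per occurrence
                System.out.println("parent " + p + ", sentence " + s + ": " + stnLength + " tokens");
            }
        }
        in.close();
    }
}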
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class outputFile, method main:
public static void main(String[] args) throws IOException, ParseException {
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value, which can also be "TFIDF" or "BM25".
String featureValue = "TF";
// The way of normalization (only 1 and 2).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM, ParentChild_Gibbs
String topicmodel = "wsdm";
String category = "tablet";
int number_of_topics = 20;
// false means no reviews from NewEgg are used in training
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = true;
// 0: load nothing as prior; 1: load both sentiment and aspect; 2: load aspect only
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// A negative converge value means the likelihood convergence check is skipped.
double converge = -1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 10;
int gibbs_iteration = 2000, gibbs_lag = 50;
gibbs_iteration = 10;
gibbs_lag = 2;
double burnIn = 0.4;
boolean display = true, sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
articleType = "Yahoo";
// articleType = "Gadgets";
// articleType = "APP";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
articleFolder = String.format("../../Code/Data/TextMiningProject/APPDescriptions");
commentFolder = String.format("../../Code/Data/TextMiningProject/APPReviews");
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
// String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_topicmodel.txt", Ngram);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
Calendar today = Calendar.getInstance();
String filePrefix = String.format("./data/results/%s-%s-%s%s-%s-%s", 1 + today.get(Calendar.MONTH), today.get(Calendar.DAY_OF_MONTH), today.get(Calendar.HOUR_OF_DAY), today.get(Calendar.MINUTE), topicmodel, articleType);
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String infoFilePath = filePrefix + "/Information.txt";
// Store the top-k word distribution over topics.
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.0;
// Used in feature selection, the ending point of the features.
double endProb = 0.95;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 3;
System.out.println("Performing feature selection, wait...");
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadStopwords(stopwords);
// analyzer.LoadParentDirectory(articleFolder, suffix);
// analyzer.LoadChildDirectory(commentFolder, suffix);
analyzer.LoadDirectory(commentFolder, suffix);
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
/**
*** parent child topic model ****
*/
// ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadParentDirectory(TechArticlesFolder, suffix);
// analyzer.LoadChildDirectory(TechCommentsFolder, suffix);
// analyzer.LoadDirectory(TechArticlesFolder, suffix);
// analyzer.LoadDirectory(TechCommentsFolder, suffix);
// analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
statisticDocLen(c);
// outputFiles(filePrefix, c);
}
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class outputFile, method statisticDocLen:
public static void statisticDocLen(_Corpus c) {
ArrayList<Double> childDocLenList = new ArrayList<Double>();
double childDocLenSum = 0;
ArrayList<_Doc> m_trainSet = c.getCollection();
for (_Doc d : m_trainSet) {
double childDocLen = d.getTotalDocLength();
childDocLenList.add(childDocLen);
childDocLenSum += childDocLen;
}
System.out.println("longest child\t" + Collections.max(childDocLenList));
System.out.println("shortest child\t" + Collections.min(childDocLenList));
// System.out.println("parent doc len\t"+parentDocLenSum/parentDocLenList.size());
System.out.println("child doc len\t" + childDocLenSum / childDocLenList.size());
}