Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class LDAGibbs4AC, method crossValidation.
public void crossValidation(int k) {
m_trainSet = new ArrayList<_Doc>();
m_testSet = new ArrayList<_Doc>();
double[] perf = null;
_Corpus parentCorpus = new _Corpus();
ArrayList<_Doc> docs = m_corpus.getCollection();
ArrayList<_ParentDoc> parentDocs = new ArrayList<_ParentDoc>();
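// collect the parent (article) documents into a separate corpus so that folds are assigned at the article level; child comments follow their parent into the training set below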
for (_Doc d : docs) {
if (d instanceof _ParentDoc) {
parentCorpus.addDoc(d);
parentDocs.add((_ParentDoc) d);
}
}
System.out.println("size of parent docs\t" + parentDocs.size());
parentCorpus.setMasks();
if (m_randomFold == true) {
perf = new double[k];
parentCorpus.shuffle(k);
int[] masks = parentCorpus.getMasks();
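// masks[j] is the fold id of parent document j: fold i is held out for testing, every other parent plus its child comments goes to training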
for (int i = 0; i < k; i++) {
for (int j = 0; j < masks.length; j++) {
if (masks[j] == i) {
m_testSet.add(parentDocs.get(j));
} else {
m_trainSet.add(parentDocs.get(j));
for (_ChildDoc d : parentDocs.get(j).m_childDocs) {
m_trainSet.add(d);
}
}
}
// writeFile(i, m_trainSet, m_testSet);
System.out.println("Fold number " + i);
infoWriter.println("Fold number " + i);
System.out.println("Train Set Size " + m_trainSet.size());
infoWriter.println("Train Set Size " + m_trainSet.size());
System.out.println("Test Set Size " + m_testSet.size());
infoWriter.println("Test Set Size " + m_testSet.size());
long start = System.currentTimeMillis();
EM();
perf[i] = Evaluation(i);
System.out.format("%s Train/Test finished in %.2f seconds...\n", this.toString(), (System.currentTimeMillis() - start) / 1000.0);
infoWriter.format("%s Train/Test finished in %.2f seconds...\n", this.toString(), (System.currentTimeMillis() - start) / 1000.0);
if (i < k - 1) {
m_trainSet.clear();
m_testSet.clear();
}
}
}
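// average per-fold perplexity; despite its name, var below ends up holding the standard deviation (sqrt of the mean squared deviation)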
double mean = Utils.sumOfArray(perf) / k, var = 0;
for (int i = 0; i < perf.length; i++) var += (perf[i] - mean) * (perf[i] - mean);
var = Math.sqrt(var / k);
System.out.format("Perplexity %.3f+/-%.3f\n", mean, var);
infoWriter.format("Perplexity %.3f+/-%.3f\n", mean, var);
}
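A minimal sketch of the fold assignment above in isolation, assuming only what the excerpt shows: _Corpus.shuffle(k) followed by getMasks() yields an array in which masks[j] is the fold id (in [0, k)) of parent document j.
// illustration only: k, masks, parentDocs and the _Doc/_ParentDoc/_ChildDoc types are taken from the excerpt above
for (int i = 0; i < k; i++) {
    ArrayList<_Doc> trainSet = new ArrayList<_Doc>();
    ArrayList<_Doc> testSet = new ArrayList<_Doc>();
    for (int j = 0; j < masks.length; j++) {
        if (masks[j] == i) {
            testSet.add(parentDocs.get(j)); // articles held out for fold i
        } else {
            trainSet.add(parentDocs.get(j)); // remaining articles...
            trainSet.addAll(parentDocs.get(j).m_childDocs); // ...plus all of their comments
        }
    }
    // train (EM) on trainSet, then evaluate perplexity on testSet
}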
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class TopicModelMain, method main.
public static void main(String[] args) throws IOException, ParseException {
int mb = 1024 * 1024;
Runtime rTime = Runtime.getRuntime();
System.out.println("totalMem\t:" + rTime.totalMemory() / mb);
// The number of document classes (rating levels) used by the analyzer.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value; it can also be "TFIDF" or "BM25"
String featureValue = "TF";
// The way of normalization (only 1 and 2 are supported).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// ACCTM, ACCTM_TwoTheta, ACCTM_C, ACCTM_CZ, ACCTM_CZLR, LDAonArticles, ACCTM_C,
// correspondence_LDA_Gibbs, LDA_Gibbs_Debug, LDA_Variational_multithread
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM,
// LDAGibbs4AC_test, DCMCorrLDA_multi_E_test,DCMLDA4AC_test, DCMDMCorrLDA_multi_E_test
// DCMDMCorrLDA_test, DCMDMMCorrLDA_test, corrLDA_Gibbs_test,
// DCMCorrLDA_Multi_EM, sparseDCMLDA_test, DCMLDA_test, sparseLDA_test, LDA_Gibbs_test
// sparseClusterDCMLDA, sparseClusterDCMLDA_test
String topicmodel = "weightedCorrespondenceModel_test";
String category = "tablet";
int number_of_topics = 15;
// false means that in training there are no reviews from NewEgg
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = true;
// 0 means nothing is loaded as prior; 1 means load both sentiment and aspect; 2 means load only aspect
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
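// (several Gibbs-sampling branches below pass alpha - 1 and beta - 1 to the samplers, which is why both priors sit just above 1 here)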
// a negative converge value means likelihood convergence is not checked
double converge = 1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 1;
int gibbs_iteration = 1000, gibbs_lag = 50;
int displayLap = 20;
// gibbs_iteration = 50;
// gibbs_lag = 20;
// displayLap = 20;
double burnIn = 0.4;
boolean sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
// articleType = "Reuters";
// articleType = "Gadgets";
// articleType = "Yahoo";
// articleType = "APP";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
// articleFolder = String.format("./data/ParentChildTopicModel/Reuters", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, articleType, topicmodel);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd-HHmm");
String filePrefix = String.format("./data/results/%s", dateFormatter.format(new Date()));
filePrefix = filePrefix + "-" + topicmodel + "-" + articleType;
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String outputFile = filePrefix + "/consoleOutput.txt";
PrintStream printStream = new PrintStream(new FileOutputStream(outputFile));
System.setOut(printStream);
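// from here on, everything written to System.out is redirected into consoleOutput.txt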
String infoFilePath = filePrefix + "/Information.txt";
// store the top-k word distribution for each topic
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.;
// Used in feature selection, the ending point of the features.
double endProb = 0.999;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 5;
System.out.println("Performing feature selection, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
/**
*** parent child topic model ****
*/
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadDirectory(commentFolder, suffix);
if (topicmodel.equals("LDA_APPMerged"))
articleFolder = String.format("./data/ParentChildTopicModel/%sDescriptionsReviews", articleType);
// articleFolder = String.format("./data/ParentChildTopicModel/%sArticles4Merged", articleType);
// commentFolder = String.format("./data/ParentChildTopicModel/%sComments4Merged", articleType);
analyzer.LoadParentDirectory(articleFolder, suffix);
// analyzer.LoadDirectory(articleFolder, suffix);
analyzer.LoadChildDirectory(commentFolder, suffix);
// analyzer.LoadChildDirectory(commentFolder, suffix);
// if (topicmodel.equals("LDA_APP") && !topicmodel.equals("LDA_APPMerged"))
// analyzer.LoadChildDirectory(commentFolder, suffix);
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// analyzer.LoadNewEggDirectory(newEggFolder, suffix); //Load all the documents as the data set.
// analyzer.LoadDirectory(amazonFolder, suffix);
analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
if (topicmodel.equals("2topic")) {
twoTopic model = new twoTopic(number_of_iteration, converge, beta, c, lambda);
if (crossV <= 1) {
for (_Doc d : c.getCollection()) {
model.inference(d);
model.printTopWords(topK);
}
} else
model.crossValidation(crossV);
} else {
pLSA model = null;
if (topicmodel.equals("pLSA")) {
model = new pLSA_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha);
} else if (topicmodel.equals("LDA_Gibbs")) {
// number_of_topics = 15;
// in Gibbs sampling, there is no need to compute the log-likelihood during sampling
model = new LDA_Gibbs(gibbs_iteration, 0, beta, c, lambda, number_of_topics, alpha, burnIn, gibbs_lag);
} else if (topicmodel.equals("LDA_Variational_multithread")) {
model = new LDA_Variational_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, varIter, varConverge);
} else if (topicmodel.equals("HTMM")) {
model = new HTMM(number_of_iteration, converge, beta, c, number_of_topics, alpha);
} else if (topicmodel.equals("HTSM")) {
model = new HTSM(number_of_iteration, converge, beta, c, number_of_topics, alpha);
} else if (topicmodel.equals("LRHTMM")) {
model = new LRHTMM(number_of_iteration, converge, beta, c, number_of_topics, alpha, lambda);
} else if (topicmodel.equals("LRHTSM")) {
model = new LRHTSM(number_of_iteration, converge, beta, c, number_of_topics, alpha, lambda);
} else if (topicmodel.equals("correspondence_LDA_Gibbs")) {
double ksi = 800;
double tau = 0.7;
// in Gibbs sampling, there is no need to compute the log-likelihood during sampling
model = new corrLDA_Gibbs(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag);
} else if (topicmodel.equals("ACCTM")) {
double mu = 1.0;
double[] gamma = { 0.5, 0.5 };
double ksi = 800;
double tau = 0.7;
model = new ACCTM(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag);
} else if (topicmodel.equals("ACCTM_C")) {
double mu = 1.0;
double[] gamma = { 0.5, 0.5 };
beta = 1.001;
double ksi = 800;
double tau = 0.7;
converge = 1e-5;
model = new ACCTM_C(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
} else if (topicmodel.equals("ACCTM_CHard")) {
double mu = 1.0;
double[] gamma = { 0.5, 0.5 };
double ksi = 800;
double tau = 0.7;
beta = 1.001;
model = new ACCTM_CHard(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
} else if (topicmodel.equals("ACCTM_CZ")) {
double mu = 1.0;
double[] gamma = { 0.5, 0.5 };
beta = 1.001;
alpha = 1.01;
double ksi = 800;
double tau = 0.7;
// number_of_topics = 30;
model = new ACCTM_CZ(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
} else if (topicmodel.equals("ACCTM_CZLR")) {
double mu = 1.0;
double[] gamma = { 0.5, 0.5 };
beta = 1.001;
alpha = 1.01;
double ksi = 800;
double tau = 0.7;
// number_of_topics = 30;
converge = 1e-9;
model = new ACCTM_CZLR(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
} else if (topicmodel.equals("DCMLDA_test")) {
converge = 1e-3;
int newtonIter = 50;
double newtonConverge = 1e-3;
// number_of_topics = 15;
model = new DCMLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, newtonIter, newtonConverge);
String priorFile = "./data/Features/" + articleType + "TopicWord.txt";
model.LoadPrior(priorFile, eta);
} else if (topicmodel.equals("LDA_Gibbs_test")) {
// number_of_topics = 15;
// in Gibbs sampling, no need to compute the log-likelihood during sampling
model = new LDA_Gibbs_test(gibbs_iteration, 0, beta, c, lambda, number_of_topics, alpha, burnIn, gibbs_lag);
} else if (topicmodel.equals("DCMCorrLDA_multi_E_test")) {
converge = 1e-2;
int newtonIter = 30;
double newtonConverge = 1e-2;
gibbs_iteration = 40;
gibbs_lag = 10;
double ksi = 800;
double tau = 0.7;
double alphaC = 0.001;
model = new DCMCorrLDA_multi_E_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, alphaC, burnIn, ksi, tau, gibbs_lag, newtonIter, newtonConverge);
String priorFile = "./data/Features/" + articleType + "TopWords.txt";
model.LoadPrior(priorFile, eta);
} else if (topicmodel.equals("LDAGibbs4AC_test")) {
double ksi = 800;
double tau = 0.7;
model = new LDAGibbs4AC_test(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, ksi, tau);
} else if (topicmodel.equals("DCMLDA4AC_test")) {
// number_of_topics = 5;
converge = 1e-3;
double ksi = 800;
double tau = 0.7;
int newtonIter = 1000;
double newtonConverge = 1e-3;
model = new DCMLDA4AC_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, ksi, tau, newtonIter, newtonConverge);
} else if (topicmodel.equals("DCMCorrLDA_test")) {
// number_of_topics = 15;
converge = 1e-3;
int newtonIter = 50;
double newtonConverge = 1e-3;
double ksi = 800;
double tau = 0.7;
double alphaC = 0.001;
model = new DCMCorrLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, alphaC, burnIn, ksi, tau, gibbs_lag, newtonIter, newtonConverge);
} else if (topicmodel.equals("sparseDCMLDA_test")) {
converge = 1e-3;
int newtonIter = 50;
double newtonConverge = 1e-3;
// number_of_topics = 15;
double tParam = 1;
double sParam = 1;
model = new sparseDCMLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, newtonIter, newtonConverge, tParam, sParam);
// String priorFile = "./data/Features/" + articleType + "TopicWord.txt";
// model.LoadPrior(priorFile, eta);
} else if (topicmodel.equals("sparseLDA_test")) {
converge = 1e-3;
// number_of_topics = 15;
double tParam = 1;
double sParam = 1;
model = new sparseLDA_test(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, tParam, sParam);
} else if (topicmodel.equals("sparseClusterDCMLDA_test")) {
converge = 1e-3;
int newtonIter = 50;
double newtonConverge = 1e-3;
double tParam = 1;
double sParam = 1;
double gammaParam = 0.01;
int clusterNum = 1;
model = new sparseClusterDCMLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, newtonIter, newtonConverge, tParam, sParam, clusterNum, gammaParam);
// String priorFile = "./data/Features/" + articleType + "TopicWord.txt";
// model.LoadPrior(priorFile, eta);
} else if (topicmodel.equals("weightedCorrespondenceModel_test")) {
beta = beta - 1;
alpha = alpha - 1;
// number_of_iteration = 2;
double lbfgsConverge = varConverge;
converge = 1e-3;
model = new weightedCorrespondenceModel_test(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, varIter, varConverge, lbfgsConverge);
// String priorFile = "./data/Features/" + articleType + "TopicWord.txt";
// model.LoadPrior(priorFile, eta);
}
model.setDisplayLap(displayLap);
model.setInforWriter(infoFilePath);
if (loadAspectSentiPrior == 1) {
System.out.println("Loading aspect-senti list from " + aspectSentiList);
model.setSentiAspectPrior(true);
model.LoadPrior(aspectSentiList, eta);
} else if (loadAspectSentiPrior == 2) {
System.out.println("Loading aspect list from " + aspectList);
model.setSentiAspectPrior(false);
model.LoadPrior(aspectList, eta);
} else {
System.out.println("No prior is added!!");
}
if (crossV <= 1) {
model.EMonCorpus();
if (topWordPath == null)
model.printTopWords(topK);
else
model.printTopWords(topK, topWordPath);
} else {
model.setRandomFold(setRandomFold);
double trainProportion = 0.8;
double testProportion = 1 - trainProportion;
model.setPerplexityProportion(testProportion);
model.crossValidation(crossV);
model.printTopWords(topK, topWordPath);
}
model.closeWriter();
if (sentence) {
String summaryFilePath = "./data/results/Topics_" + number_of_topics + "_Summary.txt";
model.setSummaryWriter(summaryFilePath);
if (category.equalsIgnoreCase("camera"))
((HTMM) model).docSummary(cameraProductList);
else if (category.equalsIgnoreCase("tablet"))
((HTMM) model).docSummary(tabletProductList);
else if (category.equalsIgnoreCase("phone"))
((HTMM) model).docSummary(phoneProductList);
else if (category.equalsIgnoreCase("tv"))
((HTMM) model).docSummary(tvProductList);
}
}
}
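For orientation, a condensed sketch of the path this main takes for a Gibbs-sampled model, using only calls that already appear in the excerpt above; the constructor and method signatures are assumed from this usage, not verified against the repository.
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
analyzer.LoadParentDirectory(articleFolder, suffix); // news articles become parent documents
analyzer.LoadChildDirectory(commentFolder, suffix); // reader comments become child documents
analyzer.setFeatureValues(featureValue, norm);
_Corpus c = analyzer.returnCorpus(fvStatFile);
pLSA model = new LDA_Gibbs(gibbs_iteration, 0, beta, c, lambda, number_of_topics, alpha, burnIn, gibbs_lag);
model.setDisplayLap(displayLap);
model.setInforWriter(infoFilePath);
if (crossV <= 1) {
    model.EMonCorpus(); // single fit on the whole corpus
    model.printTopWords(topK, topWordPath);
} else {
    model.setRandomFold(setRandomFold); // k-fold cross-validation with perplexity reporting
    model.setPerplexityProportion(1 - 0.8); // hold out 20% for perplexity, as above
    model.crossValidation(crossV);
    model.printTopWords(topK, topWordPath);
}
model.closeWriter();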
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class VectorReviewMain, method main.
public static void main(String[] args) throws IOException, ParseException {
/**
***Set these parameters before running the classifiers.****
*/
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// Document length threshold
int lengthThreshold = 5;
// k fold-cross validation
int CVFold = 10;
// Supervised classification models: "NB", "LR", "PR-LR", "SVM"
// Semi-supervised classification models: "GF", "GF-RW", "GF-RW-ML"
// Which classifier to use.
String classifier = "GF-RW-ML";
// String modelPath = "./data/Model/";
double C = 1.0;
// "SUP", "SEMI"
String style = "SEMI";
String multipleLearner = "SVM";
/**
***The parameters used in loading files.****
*/
String featureLocation = "data/Features/fv_2gram_BM25_CHI_small.txt";
String vctfile = "data/FVs/vct_2gram_BM25_CHI_tablet_small.dat";
// String featureLocation = "data/Features/fv_fake.txt";
// String vctfile = "data/Fvs/LinearRegression.dat";
/**
***Parameters in time series analysis.****
*/
// String debugOutput = String.format("data/debug/%s.sim.pair", classifier);
String debugOutput = null;
/**
**Pre-process the data.****
*/
// Feature selection.
System.out.println("Loading vectors from file, wait...");
VctAnalyzer analyzer = new VctAnalyzer(classNumber, lengthThreshold, featureLocation);
// Load all the documents as the data set.
analyzer.LoadDoc(vctfile);
_Corpus corpus = analyzer.getCorpus();
// make it binary
corpus.mapLabels(4);
/**
******Choose different classification methods.********
*/
if (style.equals("SUP")) {
if (classifier.equals("NB")) {
// Define a new naive bayes with the parameters.
System.out.println("Start naive bayes, wait...");
NaiveBayes myNB = new NaiveBayes(corpus);
// Use the movie reviews for testing the codes.
myNB.crossValidation(CVFold, corpus);
} else if (classifier.equals("KNN")) {
// Define a new kNN classifier with the parameters.
System.out.println("Start kNN, wait...");
KNN myKNN = new KNN(corpus, 10, 1);
// Use the movie reviews for testing the codes.
myKNN.crossValidation(CVFold, corpus);
} else if (classifier.equals("LR")) {
// Define a new logistic regression with the parameters.
System.out.println("Start logistic regression, wait...");
LogisticRegression myLR = new LogisticRegression(corpus, C);
myLR.setDebugOutput(debugOutput);
// Use the movie reviews for testing the codes.
myLR.crossValidation(CVFold, corpus);
// myLR.saveModel(modelPath + "LR.model");
} else if (classifier.equals("PRLR")) {
// Define a new posterior regularized logistic regression with the parameters.
System.out.println("Start posterior regularized logistic regression, wait...");
PRLogisticRegression myLR = new PRLogisticRegression(corpus, C);
myLR.setDebugOutput(debugOutput);
// Use the movie reviews for testing the codes.
myLR.crossValidation(CVFold, corpus);
// myLR.saveModel(modelPath + "LR.model");
} else if (classifier.equals("SVM")) {
System.out.println("Start SVM, wait...");
SVM mySVM = new SVM(corpus, C);
mySVM.crossValidation(CVFold, corpus);
} else if (classifier.equals("PR")) {
System.out.println("Start PageRank, wait...");
PageRank myPR = new PageRank(corpus, C, 100, 50, 1e-6);
myPR.train(corpus.getCollection());
} else
System.out.println("Classifier has not been developed yet!");
} else if (style.equals("SEMI")) {
double learningRatio = 1.0;
// k nearest labeled, k' nearest unlabeled
int k = 20, kPrime = 20;
// labeled data weight, unlabeled data weight
double tAlpha = 1.0, tBeta = 0.1;
// convergence of random walk, weight of random walk
double tDelta = 1e-4, tEta = 0.5;
boolean simFlag = false;
double threshold = 0.5;
// bound for generating rating constraints (must be zero in binary case)
int bound = 0;
boolean metricLearning = true;
if (classifier.equals("GF")) {
GaussianFields mySemi = new GaussianFields(corpus, multipleLearner, C);
mySemi.crossValidation(CVFold, corpus);
} else if (classifier.equals("GF-RW")) {
GaussianFields mySemi = new GaussianFieldsByRandomWalk(corpus, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, false);
mySemi.setDebugOutput(debugOutput);
mySemi.crossValidation(CVFold, corpus);
} else if (classifier.equals("GF-RW-ML")) {
LinearSVMMetricLearning lMetricLearner = new LinearSVMMetricLearning(corpus, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, false, bound);
lMetricLearner.setMetricLearningMethod(metricLearning);
lMetricLearner.setDebugOutput(debugOutput);
lMetricLearner.crossValidation(CVFold, corpus);
} else
System.out.println("Classifier has not been developed yet!");
} else
System.out.println("Learning paradigm has not been developed yet!");
}
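As a point of comparison, a minimal sketch of the shortest supervised route through the same pipeline, reusing only the calls shown above (classifier = "SVM", style = "SUP"):
VctAnalyzer analyzer = new VctAnalyzer(classNumber, lengthThreshold, featureLocation);
analyzer.LoadDoc(vctfile); // load the pre-computed feature vectors
_Corpus corpus = analyzer.getCorpus();
corpus.mapLabels(4); // collapse the five rating classes to binary
SVM mySVM = new SVM(corpus, C); // C = 1.0 in the excerpt
mySVM.crossValidation(CVFold, corpus); // 10-fold cross-validation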
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class featureGeneration, method main.
public static void main(String[] args) throws IOException, ParseException {
// The number of document classes (rating levels) used by the analyzer.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value; it can also be "TFIDF" or "BM25"
String featureValue = "TF";
// The way of normalization (only 1 and 2 are supported).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM, ParentChild_Gibbs
String topicmodel = "featureGeneration";
String category = "tablet";
int number_of_topics = 20;
// false means that in training there are no reviews from NewEgg
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = true;
// 0 means nothing is loaded as prior; 1 means load both sentiment and aspect; 2 means load only aspect
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// a negative converge value means likelihood convergence is not checked
double converge = -1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 10;
int gibbs_iteration = 2000, gibbs_lag = 50;
// gibbs_iteration = 10;
// gibbs_lag = 2;
double burnIn = 0.4;
boolean display = true, sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
// articleType = "Yahoo";
// articleType = "Gadgets";
// articleType = "APP";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
// articleFolder = "../../Code/Data/TextMiningProject/APPDescriptions";
// commentFolder = "../../Code/Data/TextMiningProject/APPReviews";
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s_sample.txt", Ngram, articleType);
// String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, articleType, topicmodel);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
Calendar today = Calendar.getInstance();
String filePrefix = String.format("./data/results/%s-%s-%s%s-%s", 1 + today.get(Calendar.MONTH), today.get(Calendar.DAY_OF_MONTH), today.get(Calendar.HOUR_OF_DAY), today.get(Calendar.MINUTE), topicmodel);
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String infoFilePath = filePrefix + "/Information.txt";
// store the top-k word distribution for each topic
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.50;
// Used in feature selection, the ending point of the features.
double endProb = 1;
// Filter the features with DFs smaller than this threshold.
int maxDF = -1, minDF = 10;
double DFUpperThreshold = 0.05;
System.out.println("Performing feature selection, wait...");
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
analyzer.LoadStopwords(stopwords);
// analyzer.LoadParentDirectory(articleFolder, suffix);
// analyzer.LoadChildDirectory(commentFolder, suffix);
analyzer.LoadDirectory(articleFolder, suffix);
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
// Select the features.
analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, maxDF, minDF);
System.out.println("Creating feature vectors, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
/**
*** parent child topic model ****
*/
// analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
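// Nothing further happens with c in this main: presumably the point of building it is that returnCorpus(fvStatFile) writes out the feature statistics file after feature selection.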
}