use of structures._Corpus in project IR_Base by Linda-sunshine.
the class TransductiveMain method main.
/**
 * Runs the transductive sentiment experiment with hard-coded settings.
 *
 * <p>Steps, in order: build an {@code AspectAnalyzer} over the review folder, compute TF feature
 * values, fit the selected topic model on the resulting corpus, recompute the feature values with
 * BM25, remap the labels, and finally cross-validate either a transductive learner
 * ({@code style == "SEMI"}) or a plain SVM ({@code style == "SUP"}).
 *
 * @param args command-line arguments; not read, every setting is a local constant below
 * @throws IOException if one of the loading or training calls below reports it
 * @throws ParseException if one of the loading calls below reports it
 */
public static void main(String[] args) throws IOException, ParseException {
    // Number of classes.
    int classNumber = 5;
    // N-gram order; the default value is unigram, bigrams are used here.
    int Ngram = 2;
    // Document length threshold
    int lengthThreshold = 5;
    // each document should have at least 2 sentences for HTSM, LRSHTM
    int minimunNumberofSentence = 2;

    /***** parameters for the two-topic topic model *****/
    // pLSA, LDA_Gibbs, LDA_Variational
    String topicmodel = "pLSA";
    int number_of_topics = 30;
    // these two parameters must be larger than 1!!!
    double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = 5.0;
    // negative converge means there is no need to check likelihood convergence
    double converge = -1, lambda = 0.7;
    int number_of_iteration = 100;
    boolean aspectSentiPrior = true;

    /***** The parameters used in loading files. *****/
    String folder = "./data/amazon/tablet/topicmodel";
    String suffix = ".json";
    String stopword = "./data/Model/stopwords.dat";
    // Token model.
    String tokenModel = "./data/Model/en-token.bin";
    // Sentence model. Need it for pos tagging.
    String stnModel = "./data/Model/en-sent.bin";
    String tagModel = "./data/Model/en-pos-maxent.bin";
    // The next four paths are only consumed by the disabled loadPriorPosNegWords call below.
    String sentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
    // Added by Mustafizur----------------
    String pathToPosWords = "./data/Model/SentiWordsPos.txt";
    String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
    String pathToNegationWords = "./data/Model/negation_words.txt";
    // String category = "tablets"; //"electronics"
    // String dataSize = "86jsons"; //"50K", "100K"
    // String fvFile = String.format("./data/Features/fv_%dgram_%s_%s.txt", Ngram, category, dataSize);
    // String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, category, dataSize);
    // String aspectlist = "./data/Model/aspect_output_simple.txt";
    String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
    String fvStatFile = String.format("./data/Features/fv_%dgram_stat_topicmodel.txt", Ngram);
    String aspectSentiList = "./data/Model/aspect_sentiment_tablet.txt";
    String aspectList = "./data/Model/aspect_tablet.txt";

    /***** Parameters in learning style. *****/
    // "SUP", "SEMI"
    String style = "SEMI";
    // "RW", "RW-ML", "RW-L2R"
    String method = "RW-L2R";

    /***** Parameters in transductive learning. *****/
    String debugOutput = "data/debug/topical.sim";
    // String debugOutput = null;
    boolean releaseContent = false;
    // k fold-cross validation
    int CVFold = 10;
    // choice of base learner
    String multipleLearner = "SVM";
    // trade-off parameter
    double C = 1.0;

    /***** Parameters in feature selection (currently disabled). *****/
    // String featureSelection = "DF"; //Feature selection method.
    // double startProb = 0.5; // Used in feature selection, the starting point of the features.
    // double endProb = 0.999; // Used in feature selection, the ending point of the features.
    // int DFthreshold = 30; // Filter the features with DFs smaller than this threshold.
    //
    // System.out.println("Performing feature selection, wait...");
    // jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    // analyzer.LoadStopwords(stopwords);
    // analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
    // analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.

    System.out.println("Creating feature vectors, wait...");
    AspectAnalyzer analyzer = new AspectAnalyzer(tokenModel, stnModel, tagModel, classNumber, fvFile, Ngram, lengthThreshold, aspectList, true);
    // Added by Mustafizur----------------
    analyzer.setMinimumNumberOfSentences(minimunNumberofSentence);
    // Load the stop-word list.
    analyzer.LoadStopwords(stopword);
    // Loading of the sentiment lexicons (SentiWordNet etc.) is currently disabled.
    // analyzer.loadPriorPosNegWords(sentiWordNet, pathToPosWords, pathToNegWords, pathToNegationWords);
    analyzer.setReleaseContent(releaseContent);
    // Added by Mustafizur----------------
    // Load all the documents as the data set.
    analyzer.LoadDirectory(folder, suffix);
    analyzer.setFeatureValues("TF", 0);
    // Get the collection of all the documents.
    _Corpus c = analyzer.returnCorpus(fvStatFile);

    pLSA tModel = null;
    if (topicmodel.equals("pLSA")) {
        tModel = new pLSA_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha);
    } else if (topicmodel.equals("LDA_Gibbs")) {
        tModel = new LDA_Gibbs(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, 0.4, 50);
    } else if (topicmodel.equals("LDA_Variational")) {
        tModel = new LDA_Variational_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, 10, -1);
    } else {
        System.out.println("The selected topic model has not developed yet!");
        return;
    }
    tModel.setDisplayLap(0);
    tModel.setSentiAspectPrior(aspectSentiPrior);
    tModel.LoadPrior(aspectSentiPrior ? aspectSentiList : aspectList, eta);
    tModel.EMonCorpus();

    // construct effective feature values for supervised classifiers
    analyzer.setFeatureValues("BM25", 2);
    // how to set this reasonably
    c.mapLabels(3);

    if (style.equals("SEMI")) {
        // perform transductive learning
        System.out.println("Start Transductive Learning, wait...");
        double learningRatio = 1.0;
        // k nearest labeled, k' nearest unlabeled
        int k = 30, kPrime = 20;
        // labeled data weight, unlabeled data weight
        double tAlpha = 1.0, tBeta = 0.1;
        // convergence of random walk, weight of random walk
        double tDelta = 1e-5, tEta = 0.6;
        boolean simFlag = false, weightedAvg = true;
        // bound for generating rating constraints (must be zero in binary case)
        int bound = 0;
        // top K similar documents for constructing pairwise ranking targets
        int topK = 25;
        double noiseRatio = 1.0;
        boolean metricLearning = true;
        // training LambdaRank with multi-threads
        boolean multithread_LR = true;

        GaussianFieldsByRandomWalk mySemi;
        if (method.equals("RW")) {
            mySemi = new GaussianFieldsByRandomWalk(c, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, weightedAvg);
        } else if (method.equals("RW-ML")) {
            mySemi = new LinearSVMMetricLearning(c, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, weightedAvg, bound);
            ((LinearSVMMetricLearning) mySemi).setMetricLearningMethod(metricLearning);
        } else if (method.equals("RW-L2R")) {
            mySemi = new L2RMetricLearning(c, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, weightedAvg, topK, noiseRatio, multithread_LR);
        } else {
            // Without this branch an unknown method name would leave the learner unset and the
            // calls below would fail with a NullPointerException.
            System.out.println("The selected transductive method has not developed yet!");
            return;
        }
        mySemi.setSimilarity(simFlag);
        mySemi.setDebugOutput(debugOutput);
        mySemi.crossValidation(CVFold, c);
    } else if (style.equals("SUP")) {
        // perform supervised learning
        System.out.println("Start SVM, wait...");
        SVM mySVM = new SVM(c, C);
        mySVM.crossValidation(CVFold, c);
    } else {
        // Report an unrecognised style instead of finishing silently.
        System.out.println("Learning paradigm has not developed yet!");
    }
}
use of structures._Corpus in project IR_Base by Linda-sunshine.
the class AmazonReviewMain method main.
/**
 * Runs a classification experiment on the small Amazon tablet review set.
 *
 * <p>The method first loads the documents and performs feature selection, writing the selected
 * features and their statistics to disk. It then reloads the documents restricted to the selected
 * features, computes feature values, and dispatches on the configured learning style:
 * supervised ("SUP"), semi-supervised ("SEMI"), or dumping the vectors to a file ("FV").
 *
 * @param args command-line arguments; not read, every setting is a local constant below
 * @throws IOException if one of the loading or saving calls below reports it
 * @throws ParseException if one of the loading calls below reports it
 */
public static void main(String[] args) throws IOException, ParseException {
    /***** Set these parameters before running the classifiers. *****/
    final int classNumber = 5;       // number of classes
    final int Ngram = 2;             // bigram
    final int lengthThreshold = 10;  // document length threshold
    final String featureValue = "TF"; // one of "TF", "TFIDF", "BM25", "PLN"
    final int norm = 0;              // normalization switch handed to setFeatureValues
    final int CVFold = 10;           // k of the k-fold cross validation
    final String style = "SUP";      // "SUP", "SEMI", "FV", "ASPECT"
    final String classifier = "SVM"; // "NB", "LR", "SVM", "PR"
    final String model = "SVM";      // semi-supervised model: "GF", "NB-EM"
    final double C = 1.0;
    // String modelPath = "./data/Model/";
    final String debugOutput = null; // e.g. "data/debug/LR.output"

    final String separator = "--------------------------------------------------------------------------------------";
    System.out.println(separator);
    StringBuilder banner = new StringBuilder("Parameters of this run:");
    banner.append("\nClassNumber: ").append(classNumber)
          .append("\tNgram: ").append(Ngram)
          .append("\tFeatureValue: ").append(featureValue)
          .append("\tLearning Method: ").append(style)
          .append("\tClassifier: ").append(classifier)
          .append("\nCross validation: ").append(CVFold);
    System.out.println(banner.toString());

    /***** Parameters in feature selection. *****/
    final String featureSelection = "CHI";
    final String stopwords = "./data/Model/stopwords.dat";
    final double startProb = 0.5;   // starting point of the kept features
    final double endProb = 0.999;   // ending point of the kept features
    final int maxDF = -1, minDF = 1; // document-frequency bounds passed to feature selection
    // System.out.println("Feature Seleciton: " + featureSelection + "\tStarting probability: " + startProb + "\tEnding probability:" + endProb);

    /***** The parameters used in loading files. *****/
    final String folder = "./data/amazon/tablet/small";
    final String suffix = ".json";
    final String tokenModel = "./data/Model/en-token.bin";
    final String pattern = String.format("%dgram_%s", Ngram, featureSelection);
    final String fvFile = "data/Features/fv_" + pattern + "_small.txt";
    final String fvStatFile = "data/Features/fv_stat_" + pattern + "_small.txt";
    final String vctFile = "data/Fvs/vct_" + pattern + "_tablet_small.dat";

    /***** Parameters in time series analysis. *****/
    final int window = 0;
    System.out.println("Window length: " + window);
    System.out.println(separator);

    /***** Pass 1: load the json files and select features. *****/
    DocAnalyzer selector = new DocAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    selector.LoadStopwords(stopwords);
    selector.LoadDirectory(folder, suffix);
    System.out.println("Performing feature selection, wait...");
    selector.featureSelection(fvFile, featureSelection, startProb, endProb, maxDF, minDF);
    selector.SaveCVStat(fvStatFile);

    /***** Pass 2: create vectors for the documents using the selected features. *****/
    System.out.println("Creating feature vectors, wait...");
    DocAnalyzer vectorizer = new DocAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
    // Content is kept only for PageRank or when debug output is requested.
    boolean dropContent = !classifier.equals("PR") && debugOutput == null;
    vectorizer.setReleaseContent(dropContent);
    vectorizer.LoadDirectory(folder, suffix);
    vectorizer.setFeatureValues(featureValue, norm);
    // // analyzer.setTimeFeatures(window);
    _Corpus corpus = vectorizer.getCorpus();

    /***** Execute the configured learner. *****/
    if ("FV".equals(style)) {
        corpus.save2File(vctFile);
        System.out.format("Vectors saved to %s...\n", vctFile);
        return;
    }

    if ("SEMI".equals(style)) {
        if ("GF".equals(model)) {
            System.out.println("Start Gaussian Field, wait...");
            GaussianFields gaussianFields = new GaussianFields(corpus, classifier, C);
            gaussianFields.crossValidation(CVFold, corpus);
        } else if ("NB-EM".equals(model)) {
            // corpus.setUnlabeled();
            System.out.println("Start Naive Bayes with EM, wait...");
            NaiveBayesEM naiveBayesEM = new NaiveBayesEM(corpus);
            naiveBayesEM.crossValidation(CVFold, corpus);
        }
        return;
    }

    if (!"SUP".equals(style)) {
        System.out.println("Learning paradigm has not developed yet!");
        return;
    }

    // Supervised learning.
    if ("NB".equals(classifier)) {
        System.out.println("Start naive bayes, wait...");
        NaiveBayes naiveBayes = new NaiveBayes(corpus);
        naiveBayes.crossValidation(CVFold, corpus);
    } else if ("LR".equals(classifier)) {
        System.out.println("Start logistic regression, wait...");
        LogisticRegression logistic = new LogisticRegression(corpus, C);
        logistic.setDebugOutput(debugOutput);
        logistic.crossValidation(CVFold, corpus);
        // myLR.saveModel(modelPath + "LR.model");
    } else if ("SVM".equals(classifier)) {
        System.out.println("Start SVM, wait...");
        SVM svm = new SVM(corpus, C);
        svm.crossValidation(CVFold, corpus);
    } else if ("PR".equals(classifier)) {
        System.out.println("Start PageRank, wait...");
        PageRank pageRank = new PageRank(corpus, C, 100, 50, 1e-6);
        pageRank.train(corpus.getCollection());
    } else {
        System.out.println("Classifier has not developed yet!");
    }
}
use of structures._Corpus in project IR_Base by Linda-sunshine.
the class Execution method main.
/**
 * Command-line driver: parses the arguments into a {@code Parameter} object, builds the corpus
 * (from a vector file when one exists, otherwise from the text folder, running feature selection
 * first if no feature file was supplied), optionally applies PageRank instance weighting, and
 * then runs the learner selected by {@code param.m_style} and {@code param.m_model}.
 *
 * @param args command-line arguments handed to {@code Parameter}
 * @throws IOException if one of the loading or saving calls below reports it
 * @throws ParseException if one of the loading calls below reports it
 */
public static void main(String[] args) throws IOException, ParseException {
    Parameter param = new Parameter(args);
    System.out.println(param.toString());

    // Sentence and POS models are only passed on for the HTMM family of topic models.
    boolean usesSentenceModels = param.m_model.equals("HTMM") || param.m_model.equals("LRHTMM");
    String stnModel = usesSentenceModels ? param.m_stnModel : null;
    String posModel = usesSentenceModels ? param.m_posModel : null;

    Analyzer analyzer;
    _Corpus corpus;
    boolean vectorFileAvailable = param.m_fvFile != null && (new File(param.m_fvFile)).exists();

    if (!vectorFileAvailable) {
        /***** Load the data from text files. *****/
        DocAnalyzer textAnalyzer = new DocAnalyzer(param.m_tokenModel, stnModel, posModel, param.m_classNumber, param.m_featureFile, param.m_Ngram, param.m_lengthThreshold);
        analyzer = textAnalyzer;
        textAnalyzer.setReleaseContent(!param.m_weightScheme.equals("PR"));

        if (param.m_featureFile == null) {
            /***** Pre-process the data: feature selection. *****/
            System.out.println("Performing feature selection, wait...");
            param.m_featureFile = String.format("./data/Features/%s_fv.dat", param.m_featureSelection);
            param.m_featureStat = String.format("./data/Features/%s_fv_stat.dat", param.m_featureSelection);
            System.out.println(param.printFeatureSelectionConfiguration());
            textAnalyzer.LoadStopwords(param.m_stopwords);
            // Load all the documents, then select the features.
            analyzer.LoadDirectory(param.m_folder, param.m_suffix);
            analyzer.featureSelection(param.m_featureFile, param.m_featureSelection, param.m_startProb, param.m_endProb, param.m_maxDF, param.m_minDF);
        }

        // Collect vectors for documents.
        System.out.println("Creating feature vectors, wait...");
        analyzer.LoadDirectory(param.m_folder, param.m_suffix);
        analyzer.setFeatureValues(param.m_featureValue, param.m_norm);
        corpus = analyzer.returnCorpus(param.m_featureStat);
    } else {
        /***** Load the data from the vector file. *****/
        analyzer = new VctAnalyzer(param.m_classNumber, param.m_lengthThreshold, param.m_featureFile);
        analyzer.LoadDoc(param.m_fvFile);
        corpus = analyzer.getCorpus();
    }

    if (param.m_weightScheme.equals("PR")) {
        System.out.println("Creating PageRank instance weighting, wait...");
        PageRank pageRank = new PageRank(corpus, param.m_C, 100, 50, 1e-6);
        pageRank.train(corpus.getCollection());
    }

    /***** Supervised classifiers. *****/
    if (param.m_style.equals("SUP")) {
        BaseClassifier learner = null;
        if (param.m_model.equals("NB")) {
            System.out.println("Start naive bayes, wait...");
            learner = new NaiveBayes(corpus);
        } else if (param.m_model.equals("LR")) {
            System.out.println("Start logistic regression, wait...");
            learner = new LogisticRegression(corpus, param.m_C);
        } else if (param.m_model.equals("PR-LR")) {
            System.out.println("Start posterior regularized logistic regression, wait...");
            learner = new PRLogisticRegression(corpus, param.m_C);
        } else if (param.m_model.equals("SVM")) {
            // corpus.save2File("data/FVs/fvector.dat");
            System.out.println("Start SVM, wait...");
            learner = new SVM(corpus, param.m_C);
        } else {
            System.out.println("Classifier has not been developed yet!");
            System.exit(-1);
        }
        learner.setDebugOutput(param.m_debugOutput);
        learner.crossValidation(param.m_CVFold, corpus);
        return;
    }

    /***** Semi-supervised classifiers. *****/
    if (param.m_style.equals("SEMI")) {
        BaseClassifier learner = null;
        if (param.m_model.equals("GF")) {
            System.out.println("Start Gaussian Field by matrix inversion, wait...");
            learner = new GaussianFields(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU);
        } else if (param.m_model.equals("GF-RW")) {
            System.out.println("Start Gaussian Field by random walk, wait...");
            learner = new GaussianFieldsByRandomWalk(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU, param.m_alpha, param.m_beta, param.m_converge, param.m_eta, param.m_weightedAvg);
        } else if (param.m_model.equals("GF-RW-ML")) {
            System.out.println("Start Gaussian Field with distance metric learning by random walk, wait...");
            learner = new LinearSVMMetricLearning(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU, param.m_alpha, param.m_beta, param.m_converge, param.m_eta, param.m_weightedAvg, param.m_bound);
            // ((LinearSVMMetricLearning)model).setMetricLearningMethod(false);
            // ((LinearSVMMetricLearning)model).verification(param.m_CVFold, corpus, param.m_debugOutput);
        } else {
            System.out.println("Classifier has not been developed yet!");
            System.exit(-1);
        }
        learner.setDebugOutput(param.m_debugOutput);
        learner.crossValidation(param.m_CVFold, corpus);
        return;
    }

    /***** Topic models. *****/
    if (param.m_style.equals("TM")) {
        TopicModel topicModel = null;
        if (param.m_model.equals("2topic")) {
            topicModel = new twoTopic(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda);
        } else if (param.m_model.equals("pLSA")) {
            topicModel = param.m_multithread
                    ? new pLSA_multithread(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha)
                    : new pLSA(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha);
            ((pLSA) topicModel).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("vLDA")) {
            topicModel = param.m_multithread
                    ? new LDA_Variational_multithread(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_maxVarIterations, param.m_varConverge)
                    : new LDA_Variational(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_maxVarIterations, param.m_varConverge);
            ((LDA_Variational) topicModel).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("gLDA")) {
            topicModel = new LDA_Gibbs(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_burnIn, param.m_lag);
            ((LDA_Gibbs) topicModel).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("HTMM")) {
            topicModel = new HTMM(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_numTopics, param.m_alpha);
        } else if (param.m_model.equals("LRHTMM")) {
            topicModel = new LRHTMM(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_numTopics, param.m_alpha, param.m_C);
        } else {
            System.out.println("The specified topic model has not been developed yet!");
            System.exit(-1);
        }

        if (param.m_CVFold > 1) {
            topicModel.crossValidation(param.m_CVFold);
        } else {
            // No cross validation requested: fit on the whole corpus and print the top 10 words.
            topicModel.EMonCorpus();
            topicModel.printTopWords(10);
        }
        return;
    }

    /***** Dump the feature vectors. *****/
    if (param.m_style.equals("FV")) {
        corpus.save2File(param.m_fvFile);
        System.out.format("Vectors saved to %s...\n", param.m_fvFile);
        return;
    }

    System.out.println("Learning paradigm has not developed yet!");
}
use of structures._Corpus in project IR_Base by Linda-sunshine.
the class MovieReviewMain method main.
/**
 * Main function: runs a classifier on the movie review data with hard-coded settings.
 *
 * <p>The corpus is built in one of four ways, depending on whether a controlled vocabulary file
 * ({@code providedCV}) and/or a feature selection method ({@code featureSelection}) is set:
 * the documents are loaded with the provided vocabulary (or none); if feature selection is
 * requested the selected features are written to {@code featureLocation} and the documents are
 * loaded a second time restricted to them. The chosen classifier is then cross-validated.
 *
 * @param args command-line arguments; not read, every setting is a local constant below
 * @throws IOException if one of the loading calls below reports it
 */
public static void main(String[] args) throws IOException {
    /***** Set these parameters before running the classifiers. *****/
    // Number of classes.
    int classNumber = 2;
    // The default value is unigram.
    int Ngram = 1;
    // Document length threshold
    int lengthThreshold = 5;
    // The way of calculating the feature value, which can also be "TFIDF", "BM25"
    String featureValue = "TF";
    int norm = 1;
    // Which classifier to use: "NB", "LR", "SVM".
    String classifier = "SVM";
    // Number of folds used by every classifier below.
    int CVFold = 10;
    System.out.println("--------------------------------------------------------------------------------------");
    System.out.println("Parameters of this run:" + "\nClassNumber: " + classNumber + "\tNgram: " + Ngram + "\tFeatureValue: " + featureValue + "\tClassifier: " + classifier);

    /***** The parameters used in loading files. *****/
    String folder = "data/txt_sentoken";
    String suffix = ".txt";
    // Token model.
    String tokenModel = "./data/Model/en-token.bin";
    // String finalLocation = "/Users/lingong/Documents/Lin'sWorkSpace/IR_Base/data/movie/FinalFeatureStat.txt"; //The destination of storing the final features with stats.
    // String featureLocation = "/Users/lingong/Documents/Lin'sWorkSpace/IR_Base/data/movie/SelectedFeatures.txt";
    String finalLocation = "/home/lin/Lin'sWorkSpace/IR_Base/FinalFeatureStat.txt";
    String featureLocation = "/home/lin/Lin'sWorkSpace/IR_Base/SelectedFeatures.txt";

    /***** Parameters in feature selection. *****/
    // Feature selection method, e.g. "MI"; empty means no feature selection.
    String featureSelection = "";
    // Provided CV; empty means no provided vocabulary.
    String providedCV = "Features.txt";
    // Used in feature selection, the starting point of the features.
    double startProb = 0.5;
    // Used in feature selection, the ending point of the features.
    double endProb = 1;
    // Document-frequency bounds passed to feature selection.
    int maxDF = -1, minDF = 5;
    System.out.println("Feature Seleciton: " + featureSelection + "\tStarting probability: " + startProb + "\tEnding probability:" + endProb);
    System.out.println("--------------------------------------------------------------------------------------");

    /***** Build the corpus. *****/
    boolean hasProvidedCV = !providedCV.isEmpty();
    boolean selectFeatures = !featureSelection.isEmpty();
    // Case numbering: 1 = neither, 2 = CV only, 3 = selection only, 4 = both.
    int caseNumber = 1 + (hasProvidedCV ? 1 : 0) + (selectFeatures ? 2 : 0);
    System.out.println("Case " + caseNumber + ": "
            + (hasProvidedCV ? "provided CV" : "no provided CV") + ", "
            + (selectFeatures ? "feature selection" : "no feature selection") + ". "
            + (selectFeatures ? "Start loading files to do feature selection, wait..." : "Start loading files, wait..."));

    String initialCV = hasProvidedCV ? providedCV : null;
    DocAnalyzer analyzer = new DocAnalyzer(tokenModel, classNumber, initialCV, Ngram, lengthThreshold);
    // Load all the documents as the data set.
    analyzer.LoadDirectory(folder, suffix);
    if (selectFeatures) {
        // Select the features, then reload the documents restricted to the selected ones.
        analyzer.featureSelection(featureLocation, featureSelection, startProb, endProb, maxDF, minDF);
        System.out.println("Start loading files, wait...");
        analyzer = new DocAnalyzer(tokenModel, classNumber, featureLocation, Ngram, lengthThreshold);
        analyzer.LoadDirectory(folder, suffix);
    }
    analyzer.setFeatureValues(featureValue, norm);
    _Corpus corpus = analyzer.returnCorpus(finalLocation);

    /***** Execute the configured classifier. *****/
    if (classifier.equals("NB")) {
        // Define a new naive bayes with the parameters.
        System.out.println("Start naive bayes, wait...");
        NaiveBayes myNB = new NaiveBayes(corpus);
        // Use the movie reviews for testing the codes.
        myNB.crossValidation(CVFold, corpus);
    } else if (classifier.equals("LR")) {
        // Define a new lambda.
        double lambda = 0;
        // Define a new logistics regression with the parameters.
        System.out.println("Start logistic regression, wait...");
        LogisticRegression myLR = new LogisticRegression(corpus, lambda);
        // Use the movie reviews for testing the codes.
        myLR.crossValidation(CVFold, corpus);
    } else if (classifier.equals("SVM")) {
        // corpus.save2File("data/FVs/fvector.dat");
        // The default value is 1.
        double C = 3;
        System.out.println("Start SVM, wait...");
        SVM mySVM = new SVM(corpus, C);
        mySVM.crossValidation(CVFold, corpus);
    } else {
        System.out.println("Have not developed yet!:(");
    }
}
use of structures._Corpus in project IR_Base by Linda-sunshine.
the class BaseClassifier method crossValidation.
/**
 * k-fold cross validation.
 *
 * <p>Shuffles the corpus into {@code k} folds and, for each round {@code i}, trains on the
 * documents whose mask equals {@code (i + 1) % k} or {@code (i + 2) % k} and tests on all the
 * others (deliberately more data for testing than for training). After the last round the
 * mean/variance of the collected precision/recall values is computed. If a debug output path is
 * configured, a debug writer is opened for the duration of the run and is always closed again,
 * even when training or testing fails.
 *
 * <p>NOTE(review): assuming the masks take values in {@code [0, k)}, a {@code k} of 1 or 2 puts
 * every document in the training set and leaves the test set empty - confirm callers use k >= 3.
 *
 * @param k number of folds (and of train/test rounds)
 * @param c corpus providing the documents and their fold masks
 */
public void crossValidation(int k, _Corpus c) {
    // Tracks whether this call opened the debug writer, so only that writer is closed below.
    boolean debugWriterOpened = false;
    try {
        if (m_debugOutput != null) {
            m_debugWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(m_debugOutput, false), "UTF-8"));
            debugWriterOpened = true;
            m_debugWriter.write(this.toString() + "\n");
        }
        c.shuffle(k);
        int[] masks = c.getMasks();
        ArrayList<_Doc> docs = c.getCollection();
        // Iterate over all k folds, setting up the train set and the test set for each round.
        for (int i = 0; i < k; i++) {
            for (int j = 0; j < masks.length; j++) {
                // more for testing
                if (// || masks[j]==(i+3)%k
                masks[j] == (i + 1) % k || masks[j] == (i + 2) % k)
                    m_trainSet.add(docs.get(j));
                else
                    m_testSet.add(docs.get(j));
                // //more for training
                // if(masks[j]==i)
                // m_testSet.add(docs.get(j));
                // else
                // m_trainSet.add(docs.get(j));
            }
            long start = System.currentTimeMillis();
            train();
            double accuracy = test();
            System.out.format("%s Train/Test finished in %.2f seconds with accuracy %.4f and F1 (%s)...\n", this.toString(), (System.currentTimeMillis() - start) / 1000.0, accuracy, getF1String());
            m_trainSet.clear();
            m_testSet.clear();
        }
        calculateMeanVariance(m_precisionsRecalls);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the file handle is released even if train()/test() throws.
        if (debugWriterOpened) {
            try {
                m_debugWriter.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
Aggregations