Usage example of Classifier.semisupervised.NaiveBayesEM from the IR_Base project by Linda-sunshine, taken from the main method of the class AmazonReviewMain.
/**
 * Entry point: loads Amazon tablet reviews (JSON), performs feature selection,
 * builds feature vectors, and runs the configured learning paradigm
 * (supervised classification, semi-supervised learning, or vector export).
 *
 * @param args command-line arguments (unused; all settings are hard-coded below)
 * @throws IOException    if a corpus, model, or output file cannot be read or written
 * @throws ParseException if a document fails to parse while loading
 */
public static void main(String[] args) throws IOException, ParseException {
    /***** Set these parameters before running the classifiers. *****/
    int classNumber = 5;      // number of classes (star ratings 1-5)
    int Ngram = 2;            // n-gram order; the default is bigram
    int lengthThreshold = 10; // minimum document length to keep a document

    // How the feature value is computed: "TF", "TFIDF", "BM25", "PLN"
    String featureValue = "TF";
    // Vector normalization mode passed to setFeatureValues(); 1 and 2 select a
    // norm. NOTE(review): the original comment said "only 1 and 2", yet 0 is
    // used here — presumably 0 means "no normalization"; confirm in DocAnalyzer.
    int norm = 0;
    int CVFold = 10; // k in k-fold cross validation

    // Learning paradigm: "SUP", "SEMI", "FV" (export vectors), "ASPECT"
    String style = "SUP";
    // Supervised classifier (used when style == "SUP"): "NB", "LR", "SVM", "PR"
    String classifier = "SVM";
    // Semi-supervised model (used when style == "SEMI"): "GF", "NB-EM"
    String model = "SVM";
    double C = 1.0; // regularization trade-off parameter for LR and SVM

    // String modelPath = "./data/Model/";
    // "data/debug/LR.output";
    String debugOutput = null; // non-null enables LR debug output (and retains doc content)

    System.out.println("--------------------------------------------------------------------------------------");
    System.out.println("Parameters of this run:" + "\nClassNumber: " + classNumber + "\tNgram: " + Ngram + "\tFeatureValue: " + featureValue + "\tLearning Method: " + style + "\tClassifier: " + classifier + "\nCross validation: " + CVFold);

    /***** Parameters in feature selection. *****/
    String featureSelection = "CHI"; // feature selection method
    String stopwords = "./data/Model/stopwords.dat";
    double startProb = 0.5;  // starting point of the ranked features
    double endProb = 0.999;  // ending point of the ranked features
    // Filter features by document frequency; -1 presumably disables the upper bound.
    int maxDF = -1, minDF = 1;
    // System.out.println("Feature Selection: " + featureSelection + "\tStarting probability: " + startProb + "\tEnding probability:" + endProb);

    /***** Parameters used in loading files. *****/
    String folder = "./data/amazon/tablet/small";
    String suffix = ".json";
    String tokenModel = "./data/Model/en-token.bin"; // tokenizer model
    String pattern = String.format("%dgram_%s", Ngram, featureSelection);
    String fvFile = String.format("data/Features/fv_%s_small.txt", pattern);
    String fvStatFile = String.format("data/Features/fv_stat_%s_small.txt", pattern);
    String vctFile = String.format("data/Fvs/vct_%s_tablet_small.dat", pattern);

    /***** Parameters in time series analysis. *****/
    int window = 0;
    System.out.println("Window length: " + window);
    System.out.println("--------------------------------------------------------------------------------------");

    /***** Loading json files *****/
    DocAnalyzer analyzer = new DocAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    analyzer.LoadStopwords(stopwords);
    analyzer.LoadDirectory(folder, suffix); // load all the documents as the data set

    /***** Feature selection *****/
    System.out.println("Performing feature selection, wait...");
    analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, maxDF, minDF);
    analyzer.SaveCVStat(fvStatFile);

    /***** Create vectors for documents *****/
    System.out.println("Creating feature vectors, wait...");
    // Second pass: re-load with the selected feature file so vectors use it.
    analyzer = new DocAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
    // Only PageRank and debug output need the raw document content retained.
    analyzer.setReleaseContent(!(classifier.equals("PR") || debugOutput != null));
    analyzer.LoadDirectory(folder, suffix);
    analyzer.setFeatureValues(featureValue, norm);
    // analyzer.setTimeFeatures(window);
    _Corpus corpus = analyzer.getCorpus();

    // Execute the configured learning paradigm.
    if (style.equals("SUP")) {
        if (classifier.equals("NB")) {
            System.out.println("Start naive bayes, wait...");
            NaiveBayes myNB = new NaiveBayes(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else if (classifier.equals("LR")) {
            System.out.println("Start logistic regression, wait...");
            LogisticRegression myLR = new LogisticRegression(corpus, C);
            myLR.setDebugOutput(debugOutput);
            myLR.crossValidation(CVFold, corpus);
            // myLR.saveModel(modelPath + "LR.model");
        } else if (classifier.equals("SVM")) {
            System.out.println("Start SVM, wait...");
            SVM mySVM = new SVM(corpus, C);
            mySVM.crossValidation(CVFold, corpus);
        } else if (classifier.equals("PR")) {
            System.out.println("Start PageRank, wait...");
            PageRank myPR = new PageRank(corpus, C, 100, 50, 1e-6);
            myPR.train(corpus.getCollection());
        } else {
            System.out.println("Classifier has not developed yet!");
        }
    } else if (style.equals("SEMI")) {
        if (model.equals("GF")) {
            System.out.println("Start Gaussian Field, wait...");
            GaussianFields mySemi = new GaussianFields(corpus, classifier, C);
            mySemi.crossValidation(CVFold, corpus);
        } else if (model.equals("NB-EM")) {
            // corpus.setUnlabeled();
            System.out.println("Start Naive Bayes with EM, wait...");
            NaiveBayesEM myNB = new NaiveBayesEM(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else {
            // BUG FIX: this branch previously fell through silently — with the
            // default model = "SVM", selecting style = "SEMI" did nothing and
            // printed no diagnostic, unlike every sibling branch.
            System.out.println("Semi-supervised model has not developed yet!");
        }
    } else if (style.equals("FV")) {
        corpus.save2File(vctFile);
        System.out.format("Vectors saved to %s...\n", vctFile);
    } else {
        System.out.println("Learning paradigm has not developed yet!");
    }
}
End of excerpt (AmazonReviewMain.main).