Search in sources :

Example 1 with NaiveBayesEM

Use of Classifier.semisupervised.NaiveBayesEM in project IR_Base by Linda-sunshine.

The main method of the class AmazonReviewMain.

/**
 * Runs one text-classification experiment over the JSON documents found under
 * {@code ./data/amazon/tablet/small}.
 *
 * <p>The run has three phases:
 * <ol>
 *   <li>Load the documents with a {@code DocAnalyzer} that has no feature file, select
 *       features, and write the selected features and their statistics to disk.</li>
 *   <li>Re-create the analyzer with the feature file just written, load the documents
 *       again, and compute feature values to obtain the corpus.</li>
 *   <li>Dispatch on {@code style} (and then on {@code classifier} or {@code model}) to
 *       run the chosen learner, or to dump the feature vectors to a file.</li>
 * </ol>
 *
 * <p>All settings are hard-coded locals; edit them before running.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared by the signature; presumably raised by the analyzer or
 *     corpus file operations (confirm against those classes)
 * @throws ParseException declared by the signature; presumably raised while parsing the
 *     loaded documents (confirm against {@code DocAnalyzer})
 */
public static void main(String[] args) throws IOException, ParseException {
    /* ---- Experiment parameters: set these before running. ---- */

    // Number of classes.
    int classNumber = 5;
    // N-gram order passed to the analyzer (2 = bigram).
    int Ngram = 2;
    // Document length threshold passed to the analyzer.
    int lengthThreshold = 10;
    // How feature values are computed: "TF", "TFIDF", "BM25" or "PLN".
    String featureValue = "TF";
    // Normalization mode handed to setFeatureValues.
    // NOTE(review): the original comment said "only 1 and 2", yet 0 is passed here;
    // presumably 0 means "no normalization" -- confirm against DocAnalyzer.
    int norm = 0;
    // Number of folds for cross validation.
    int CVFold = 10;
    // Learning paradigm: "SUP", "SEMI" or "FV" (the original comment also listed
    // "ASPECT", which has no branch below).
    String style = "SUP";
    // Classifier: "NB", "LR", "SVM" or "PR".
    String classifier = "SVM";
    // Semi-supervised model: "GF" or "NB-EM". Only consulted when style is "SEMI";
    // the default below matches neither of them.
    String model = "SVM";
    // Passed to the LR, SVM, PageRank and GaussianFields constructors.
    double C = 1.0;
    // Debug output path handed to logistic regression, e.g. "data/debug/LR.output";
    // a non-null value also keeps document content loaded (see setReleaseContent below).
    String debugOutput = null;

    System.out.println("--------------------------------------------------------------------------------------");
    System.out.println("Parameters of this run:" + "\nClassNumber: " + classNumber + "\tNgram: " + Ngram + "\tFeatureValue: " + featureValue + "\tLearning Method: " + style + "\tClassifier: " + classifier + "\nCross validation: " + CVFold);

    /* ---- Feature selection parameters. ---- */

    // Feature selection method.
    String featureSelection = "CHI";
    String stopwords = "./data/Model/stopwords.dat";
    // Starting point of the selected feature range.
    double startProb = 0.5;
    // Ending point of the selected feature range.
    double endProb = 0.999;
    // Document-frequency bounds passed to feature selection.
    int maxDF = -1, minDF = 1;

    /* ---- File locations. ---- */

    String folder = "./data/amazon/tablet/small";
    String suffix = ".json";
    // Tokenizer model.
    String tokenModel = "./data/Model/en-token.bin";
    // Output file names are keyed by n-gram order and feature selection method.
    String pattern = String.format("%dgram_%s", Ngram, featureSelection);
    String fvFile = String.format("data/Features/fv_%s_small.txt", pattern);
    String fvStatFile = String.format("data/Features/fv_stat_%s_small.txt", pattern);
    String vctFile = String.format("data/Fvs/vct_%s_tablet_small.dat", pattern);

    /* ---- Time series analysis parameters. ---- */

    // Only reported below; no time features are applied to the analyzer in this run.
    int window = 0;
    System.out.println("Window length: " + window);
    System.out.println("--------------------------------------------------------------------------------------");

    /* ---- Phase 1: load the documents and select features. ---- */

    // No feature file yet (null): this pass exists to produce fvFile.
    DocAnalyzer analyzer = new DocAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    analyzer.LoadStopwords(stopwords);
    analyzer.LoadDirectory(folder, suffix);

    System.out.println("Performing feature selection, wait...");
    analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, maxDF, minDF);
    analyzer.SaveCVStat(fvStatFile);

    /* ---- Phase 2: build feature vectors using the selected features. ---- */

    System.out.println("Creating feature vectors, wait...");
    analyzer = new DocAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
    // Content is kept only for PageRank or when debug output is requested;
    // every other configuration lets the analyzer release it.
    analyzer.setReleaseContent(!(classifier.equals("PR") || debugOutput != null));
    analyzer.LoadDirectory(folder, suffix);
    analyzer.setFeatureValues(featureValue, norm);
    _Corpus corpus = analyzer.getCorpus();

    /* ---- Phase 3: run the selected learner. ---- */

    if (style.equals("SUP")) {
        if (classifier.equals("NB")) {
            System.out.println("Start naive bayes, wait...");
            NaiveBayes myNB = new NaiveBayes(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else if (classifier.equals("LR")) {
            System.out.println("Start logistic regression, wait...");
            LogisticRegression myLR = new LogisticRegression(corpus, C);
            myLR.setDebugOutput(debugOutput);
            myLR.crossValidation(CVFold, corpus);
        } else if (classifier.equals("SVM")) {
            System.out.println("Start SVM, wait...");
            SVM mySVM = new SVM(corpus, C);
            mySVM.crossValidation(CVFold, corpus);
        } else if (classifier.equals("PR")) {
            // PageRank is trained on the whole collection, not cross-validated.
            System.out.println("Start PageRank, wait...");
            PageRank myPR = new PageRank(corpus, C, 100, 50, 1e-6);
            myPR.train(corpus.getCollection());
        } else {
            System.out.println("Classifier has not developed yet!");
        }
    } else if (style.equals("SEMI")) {
        if (model.equals("GF")) {
            System.out.println("Start Gaussian Field, wait...");
            GaussianFields mySemi = new GaussianFields(corpus, classifier, C);
            mySemi.crossValidation(CVFold, corpus);
        } else if (model.equals("NB-EM")) {
            System.out.println("Start Naive Bayes with EM, wait...");
            NaiveBayesEM myNB = new NaiveBayesEM(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else {
            // Previously an unrecognized model ended the run without any output, which
            // the default model value triggers as soon as style is set to "SEMI".
            System.out.println("Semi-supervised model \"" + model + "\" has not been developed yet!");
        }
    } else if (style.equals("FV")) {
        // Only dump the feature vectors; no learner is run.
        corpus.save2File(vctFile);
        System.out.format("Vectors saved to %s...\n", vctFile);
    } else {
        System.out.println("Learning paradigm has not developed yet!");
    }
}
Also used : structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) DocAnalyzer(Analyzer.DocAnalyzer) PageRank(influence.PageRank) NaiveBayesEM(Classifier.semisupervised.NaiveBayesEM) GaussianFields(Classifier.semisupervised.GaussianFields) SVM(Classifier.supervised.SVM) LogisticRegression(Classifier.supervised.LogisticRegression)

Aggregations

DocAnalyzer (Analyzer.DocAnalyzer)1 GaussianFields (Classifier.semisupervised.GaussianFields)1 NaiveBayesEM (Classifier.semisupervised.NaiveBayesEM)1 LogisticRegression (Classifier.supervised.LogisticRegression)1 NaiveBayes (Classifier.supervised.NaiveBayes)1 SVM (Classifier.supervised.SVM)1 PageRank (influence.PageRank)1 structures._Corpus (structures._Corpus)1