Search in sources :

Example 1 with VctAnalyzer

Use of Analyzer.VctAnalyzer in the project IR_Base by Linda-sunshine.

From the class Execution, method main.

/**
 * Entry point that configures a run from the command-line arguments, builds the corpus and
 * executes the selected learning paradigm.
 *
 * <p>The corpus is loaded from the feature-vector file when {@code param.m_fvFile} names an
 * existing file; otherwise it is built from the files under {@code param.m_folder}, running
 * feature selection first when no feature file was supplied. The paradigm is selected by
 * {@code param.m_style}: {@code "SUP"}, {@code "SEMI"}, {@code "TM"} or {@code "FV"} (save the
 * feature vectors to {@code param.m_fvFile}). An unknown model name inside a paradigm prints a
 * message and exits with status -1; an unknown paradigm only prints a message.
 *
 * @param args command-line options, parsed by {@code Parameter}
 * @throws IOException propagated from the calls made by this method
 * @throws ParseException propagated from the calls made by this method
 */
public static void main(String[] args) throws IOException, ParseException {
    Parameter param = new Parameter(args);
    System.out.println(param.toString());
    // The sentence and POS models are only passed to the analyzer for the HTMM-family models.
    boolean isHtmmFamily = param.m_model.equals("HTMM") || param.m_model.equals("LRHTMM");
    String stnModel = isHtmmFamily ? param.m_stnModel : null;
    String posModel = isHtmmFamily ? param.m_posModel : null;
    _Corpus corpus;
    Analyzer analyzer;
    if (param.m_fvFile != null && (new File(param.m_fvFile)).exists()) {
        // Load the data from the vector file.
        analyzer = new VctAnalyzer(param.m_classNumber, param.m_lengthThreshold, param.m_featureFile);
        // Load all the documents as the data set.
        analyzer.LoadDoc(param.m_fvFile);
        corpus = analyzer.getCorpus();
    } else {
        // Load the data from text files.
        analyzer = new DocAnalyzer(param.m_tokenModel, stnModel, posModel, param.m_classNumber, param.m_featureFile, param.m_Ngram, param.m_lengthThreshold);
        // Content is released unless the "PR" weighting scheme is requested.
        ((DocAnalyzer) analyzer).setReleaseContent(!param.m_weightScheme.equals("PR"));
        if (param.m_featureFile == null) {
            // Pre-process the data: no feature file was given, so run feature selection first.
            System.out.println("Performing feature selection, wait...");
            param.m_featureFile = String.format("./data/Features/%s_fv.dat", param.m_featureSelection);
            param.m_featureStat = String.format("./data/Features/%s_fv_stat.dat", param.m_featureSelection);
            System.out.println(param.printFeatureSelectionConfiguration());
            ((DocAnalyzer) analyzer).LoadStopwords(param.m_stopwords);
            // First pass over the documents, used for feature selection.
            analyzer.LoadDirectory(param.m_folder, param.m_suffix);
            // Select the features.
            analyzer.featureSelection(param.m_featureFile, param.m_featureSelection, param.m_startProb, param.m_endProb, param.m_maxDF, param.m_minDF);
        }
        // Collect vectors for documents.
        System.out.println("Creating feature vectors, wait...");
        // Load all the documents as the data set (a second pass when feature selection ran above).
        analyzer.LoadDirectory(param.m_folder, param.m_suffix);
        analyzer.setFeatureValues(param.m_featureValue, param.m_norm);
        corpus = analyzer.returnCorpus(param.m_featureStat);
    }
    if (param.m_weightScheme.equals("PR")) {
        System.out.println("Creating PageRank instance weighting, wait...");
        PageRank myPR = new PageRank(corpus, param.m_C, 100, 50, 1e-6);
        myPR.train(corpus.getCollection());
    }
    // Execute different classifiers.
    if (param.m_style.equals("SUP")) {
        BaseClassifier model = null;
        if (param.m_model.equals("NB")) {
            // Define a new naive Bayes classifier.
            System.out.println("Start naive bayes, wait...");
            model = new NaiveBayes(corpus);
        } else if (param.m_model.equals("LR")) {
            // Define a new logistic regression with the parameters.
            System.out.println("Start logistic regression, wait...");
            model = new LogisticRegression(corpus, param.m_C);
        } else if (param.m_model.equals("PR-LR")) {
            // Define a new posterior regularized logistic regression with the parameters.
            System.out.println("Start posterior regularized logistic regression, wait...");
            model = new PRLogisticRegression(corpus, param.m_C);
        } else if (param.m_model.equals("SVM")) {
            System.out.println("Start SVM, wait...");
            model = new SVM(corpus, param.m_C);
        } else {
            System.out.println("Classifier has not been developed yet!");
            System.exit(-1);
        }
        model.setDebugOutput(param.m_debugOutput);
        model.crossValidation(param.m_CVFold, corpus);
    } else if (param.m_style.equals("SEMI")) {
        BaseClassifier model = null;
        if (param.m_model.equals("GF")) {
            System.out.println("Start Gaussian Field by matrix inversion, wait...");
            model = new GaussianFields(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU);
        } else if (param.m_model.equals("GF-RW")) {
            System.out.println("Start Gaussian Field by random walk, wait...");
            model = new GaussianFieldsByRandomWalk(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU, param.m_alpha, param.m_beta, param.m_converge, param.m_eta, param.m_weightedAvg);
        } else if (param.m_model.equals("GF-RW-ML")) {
            System.out.println("Start Gaussian Field with distance metric learning by random walk, wait...");
            model = new LinearSVMMetricLearning(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU, param.m_alpha, param.m_beta, param.m_converge, param.m_eta, param.m_weightedAvg, param.m_bound);
        } else {
            System.out.println("Classifier has not been developed yet!");
            System.exit(-1);
        }
        model.setDebugOutput(param.m_debugOutput);
        model.crossValidation(param.m_CVFold, corpus);
    } else if (param.m_style.equals("TM")) {
        TopicModel model = null;
        if (param.m_model.equals("2topic")) {
            model = new twoTopic(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda);
        } else if (param.m_model.equals("pLSA")) {
            if (!param.m_multithread) {
                model = new pLSA(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha);
            } else {
                model = new pLSA_multithread(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha);
            }
            ((pLSA) model).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("vLDA")) {
            if (!param.m_multithread) {
                model = new LDA_Variational(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_maxVarIterations, param.m_varConverge);
            } else {
                model = new LDA_Variational_multithread(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_maxVarIterations, param.m_varConverge);
            }
            ((LDA_Variational) model).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("gLDA")) {
            model = new LDA_Gibbs(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_burnIn, param.m_lag);
            ((LDA_Gibbs) model).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("HTMM")) {
            model = new HTMM(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_numTopics, param.m_alpha);
        } else if (param.m_model.equals("LRHTMM")) {
            model = new LRHTMM(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_numTopics, param.m_alpha, param.m_C);
        } else {
            System.out.println("The specified topic model has not been developed yet!");
            System.exit(-1);
        }
        if (param.m_CVFold <= 1) {
            // No cross validation requested: fit on the whole corpus and print the top 10 words.
            model.EMonCorpus();
            model.printTopWords(10);
        } else {
            model.crossValidation(param.m_CVFold);
        }
    } else if (param.m_style.equals("FV")) {
        corpus.save2File(param.m_fvFile);
        System.out.format("Vectors saved to %s...\n", param.m_fvFile);
    } else {
        System.out.println("Learning paradigm has not been developed yet!");
    }
}
Also used : PRLogisticRegression(Classifier.supervised.PRLogisticRegression) GaussianFieldsByRandomWalk(Classifier.semisupervised.GaussianFieldsByRandomWalk) DocAnalyzer(Analyzer.DocAnalyzer) topicmodels.twoTopic(topicmodels.twoTopic) VctAnalyzer(Analyzer.VctAnalyzer) SVM(Classifier.supervised.SVM) DocAnalyzer(Analyzer.DocAnalyzer) VctAnalyzer(Analyzer.VctAnalyzer) Analyzer(Analyzer.Analyzer) TopicModel(topicmodels.TopicModel) LDA_Variational(topicmodels.LDA.LDA_Variational) HTMM(topicmodels.markovmodel.HTMM) LRHTMM(topicmodels.markovmodel.LRHTMM) structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) BaseClassifier(Classifier.BaseClassifier) LDA_Variational_multithread(topicmodels.multithreads.LDA.LDA_Variational_multithread) topicmodels.multithreads.pLSA.pLSA_multithread(topicmodels.multithreads.pLSA.pLSA_multithread) PRLogisticRegression(Classifier.supervised.PRLogisticRegression) LogisticRegression(Classifier.supervised.LogisticRegression) topicmodels.pLSA.pLSA(topicmodels.pLSA.pLSA) LRHTMM(topicmodels.markovmodel.LRHTMM) PageRank(influence.PageRank) LinearSVMMetricLearning(Classifier.metricLearning.LinearSVMMetricLearning) LDA_Gibbs(topicmodels.LDA.LDA_Gibbs) Parameter(structures.Parameter) GaussianFields(Classifier.semisupervised.GaussianFields) File(java.io.File)

Example 2 with VctAnalyzer

Use of Analyzer.VctAnalyzer in the project IR_Base by Linda-sunshine.

From the class VectorReviewMain, method main.

/**
 * Runs a classification experiment on feature vectors loaded from a file, using the settings
 * hard-coded at the top of the method.
 *
 * <p>The vectors are loaded through {@code VctAnalyzer}, the labels are remapped with
 * {@code corpus.mapLabels(4)}, and the classifier named by {@code classifier} is cross-validated
 * under the paradigm named by {@code style} ({@code "SUP"} or {@code "SEMI"}). Unknown
 * classifier or paradigm names only print a message.
 *
 * @param args not used
 * @throws IOException propagated from the calls made by this method
 * @throws ParseException propagated from the calls made by this method
 */
public static void main(String[] args) throws IOException, ParseException {
    // ---- Set these parameters before running the classifiers. ----
    // Number of classes handed to the analyzer.
    int classNumber = 5;
    // Document length threshold.
    int lengthThreshold = 5;
    // k-fold cross validation.
    int CVFold = 10;
    // Supervised classification models: "NB", "KNN", "LR", "PR-LR" (or "PRLR"), "SVM", "PR"
    // Semi-supervised classification models: "GF", "GF-RW", "GF-RW-ML"
    // Which classifier to use.
    String classifier = "GF-RW-ML";
    double C = 1.0;
    // "SUP", "SEMI"
    String style = "SEMI";
    // Passed to the semi-supervised models as their second constructor argument.
    String multipleLearner = "SVM";

    // ---- The parameters used in loading files. ----
    String featureLocation = "data/Features/fv_2gram_BM25_CHI_small.txt";
    String vctfile = "data/FVs/vct_2gram_BM25_CHI_tablet_small.dat";

    // Debug output path; null leaves it unset.
    // Example: String.format("data/debug/%s.sim.pair", classifier)
    String debugOutput = null;

    // ---- Load the pre-computed feature vectors. ----
    System.out.println("Loading vectors from file, wait...");
    VctAnalyzer analyzer = new VctAnalyzer(classNumber, lengthThreshold, featureLocation);
    // Load all the documents as the data set.
    analyzer.LoadDoc(vctfile);
    _Corpus corpus = analyzer.getCorpus();
    // Make the labels binary (per the original author's note on mapLabels(4)).
    corpus.mapLabels(4);

    // ---- Choose different classification methods. ----
    if (style.equals("SUP")) {
        if (classifier.equals("NB")) {
            // Define a new naive Bayes classifier.
            System.out.println("Start naive bayes, wait...");
            NaiveBayes myNB = new NaiveBayes(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else if (classifier.equals("KNN")) {
            // Define a new kNN classifier.
            System.out.println("Start kNN, wait...");
            KNN myKNN = new KNN(corpus, 10, 1);
            myKNN.crossValidation(CVFold, corpus);
        } else if (classifier.equals("LR")) {
            // Define a new logistic regression with the parameters.
            System.out.println("Start logistic regression, wait...");
            LogisticRegression myLR = new LogisticRegression(corpus, C);
            myLR.setDebugOutput(debugOutput);
            myLR.crossValidation(CVFold, corpus);
        } else if (classifier.equals("PR-LR") || classifier.equals("PRLR")) {
            // Accept the documented name "PR-LR" as well as the legacy spelling "PRLR".
            System.out.println("Start posterior regularized logistic regression, wait...");
            PRLogisticRegression myLR = new PRLogisticRegression(corpus, C);
            myLR.setDebugOutput(debugOutput);
            myLR.crossValidation(CVFold, corpus);
        } else if (classifier.equals("SVM")) {
            System.out.println("Start SVM, wait...");
            SVM mySVM = new SVM(corpus, C);
            mySVM.crossValidation(CVFold, corpus);
        } else if (classifier.equals("PR")) {
            // PageRank is only trained here; no cross validation is run.
            System.out.println("Start PageRank, wait...");
            PageRank myPR = new PageRank(corpus, C, 100, 50, 1e-6);
            myPR.train(corpus.getCollection());
        } else {
            System.out.println("Classifier has not been developed yet!");
        }
    } else if (style.equals("SEMI")) {
        double learningRatio = 1.0;
        // k nearest labeled, k' nearest unlabeled
        int k = 20, kPrime = 20;
        // labeled data weight, unlabeled data weight
        double tAlpha = 1.0, tBeta = 0.1;
        // convergence of random walk, weight of random walk
        double tDelta = 1e-4, tEta = 0.5;
        // bound for generating rating constraints (must be zero in binary case)
        int bound = 0;
        boolean metricLearning = true;
        if (classifier.equals("GF")) {
            GaussianFields mySemi = new GaussianFields(corpus, multipleLearner, C);
            mySemi.crossValidation(CVFold, corpus);
        } else if (classifier.equals("GF-RW")) {
            GaussianFields mySemi = new GaussianFieldsByRandomWalk(corpus, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, false);
            mySemi.setDebugOutput(debugOutput);
            mySemi.crossValidation(CVFold, corpus);
        } else if (classifier.equals("GF-RW-ML")) {
            LinearSVMMetricLearning lMetricLearner = new LinearSVMMetricLearning(corpus, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, false, bound);
            lMetricLearner.setMetricLearningMethod(metricLearning);
            lMetricLearner.setDebugOutput(debugOutput);
            lMetricLearner.crossValidation(CVFold, corpus);
        } else {
            System.out.println("Classifier has not been developed yet!");
        }
    } else {
        System.out.println("Learning paradigm has not been developed yet!");
    }
}
Also used : PRLogisticRegression(Classifier.supervised.PRLogisticRegression) GaussianFieldsByRandomWalk(Classifier.semisupervised.GaussianFieldsByRandomWalk) PageRank(influence.PageRank) VctAnalyzer(Analyzer.VctAnalyzer) KNN(Classifier.supervised.KNN) SVM(Classifier.supervised.SVM) structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) LinearSVMMetricLearning(Classifier.metricLearning.LinearSVMMetricLearning) GaussianFields(Classifier.semisupervised.GaussianFields) PRLogisticRegression(Classifier.supervised.PRLogisticRegression) LogisticRegression(Classifier.supervised.LogisticRegression)

Aggregations

VctAnalyzer (Analyzer.VctAnalyzer)2 LinearSVMMetricLearning (Classifier.metricLearning.LinearSVMMetricLearning)2 GaussianFields (Classifier.semisupervised.GaussianFields)2 GaussianFieldsByRandomWalk (Classifier.semisupervised.GaussianFieldsByRandomWalk)2 LogisticRegression (Classifier.supervised.LogisticRegression)2 NaiveBayes (Classifier.supervised.NaiveBayes)2 PRLogisticRegression (Classifier.supervised.PRLogisticRegression)2 SVM (Classifier.supervised.SVM)2 PageRank (influence.PageRank)2 structures._Corpus (structures._Corpus)2 Analyzer (Analyzer.Analyzer)1 DocAnalyzer (Analyzer.DocAnalyzer)1 BaseClassifier (Classifier.BaseClassifier)1 KNN (Classifier.supervised.KNN)1 File (java.io.File)1 Parameter (structures.Parameter)1 LDA_Gibbs (topicmodels.LDA.LDA_Gibbs)1 LDA_Variational (topicmodels.LDA.LDA_Variational)1 TopicModel (topicmodels.TopicModel)1 HTMM (topicmodels.markovmodel.HTMM)1