Examples with structures._Corpus - structures._Corpus

Example 6 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

the class TransductiveMain method main.

/**
 * Runs the transductive-learning experiment with the settings hard-coded below:
 * builds feature vectors for the documents under {@code folder}, fits the selected
 * topic model on the resulting corpus, re-weights the features with BM25, and then
 * performs either semi-supervised ("SEMI") or supervised SVM ("SUP") learning under
 * k-fold cross validation.
 *
 * @param args command-line arguments (not read; every setting is a local constant)
 * @throws IOException declared for the analyzer/model calls below
 * @throws ParseException declared for the analyzer/model calls below
 */
public static void main(String[] args) throws IOException, ParseException {
    // Number of classes handed to the analyzer.
    int classNumber = 5;
    // N-gram setting handed to the analyzer and used in the feature file names.
    int Ngram = 2;
    // Document length threshold
    int lengthThreshold = 5;
    // each document should have at least 2 sentences for HTSM, LRSHTM
    int minimunNumberofSentence = 2;
    /**
     ***parameters for the two-topic topic model****
     */
    // pLSA, LDA_Gibbs, LDA_Variational
    String topicmodel = "pLSA";
    int number_of_topics = 30;
    // these two parameters must be larger than 1!!!
    double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = 5.0;
    // NOTE(review): a negative value presumably means the likelihood convergence
    // check is skipped - confirm against the topic model implementation
    double converge = -1, lambda = 0.7;
    int number_of_iteration = 100;
    boolean aspectSentiPrior = true;
    /**
     ***The parameters used in loading files.****
     */
    String folder = "./data/amazon/tablet/topicmodel";
    String suffix = ".json";
    String stopword = "./data/Model/stopwords.dat";
    // Token model.
    String tokenModel = "./data/Model/en-token.bin";
    // Sentence model. Need it for pos tagging.
    String stnModel = "./data/Model/en-sent.bin";
    String tagModel = "./data/Model/en-pos-maxent.bin";
    // The next four paths are only referenced by the commented-out
    // loadPriorPosNegWords call further down.
    String sentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
    // Added by Mustafizur----------------
    String pathToPosWords = "./data/Model/SentiWordsPos.txt";
    String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
    String pathToNegationWords = "./data/Model/negation_words.txt";
    // String category = "tablets"; //"electronics"
    // String dataSize = "86jsons"; //"50K", "100K"
    // String fvFile = String.format("./data/Features/fv_%dgram_%s_%s.txt", Ngram, category, dataSize);
    // String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, category, dataSize);
    // String aspectlist = "./data/Model/aspect_output_simple.txt";
    String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
    String fvStatFile = String.format("./data/Features/fv_%dgram_stat_topicmodel.txt", Ngram);
    String aspectSentiList = "./data/Model/aspect_sentiment_tablet.txt";
    String aspectList = "./data/Model/aspect_tablet.txt";
    /**
     ***Parameters in learning style.****
     */
    // "SUP", "SEMI"
    String style = "SEMI";
    // "RW", "RW-ML", "RW-L2R"
    String method = "RW-L2R";
    /**
     ***Parameters in transductive learning.****
     */
    String debugOutput = "data/debug/topical.sim";
    // String debugOutput = null;
    boolean releaseContent = false;
    // k fold-cross validation
    int CVFold = 10;
    // choice of base learner
    String multipleLearner = "SVM";
    // trade-off parameter
    double C = 1.0;
    /**
     ***Parameters in feature selection.****
     */
    // String featureSelection = "DF"; //Feature selection method.
    // double startProb = 0.5; // Used in feature selection, the starting point of the features.
    // double endProb = 0.999; // Used in feature selection, the ending point of the features.
    // int DFthreshold = 30; // Filter the features with DFs smaller than this threshold.
    //
    // System.out.println("Performing feature selection, wait...");
    // jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    // analyzer.LoadStopwords(stopwords);
    // analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
    // analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
    System.out.println("Creating feature vectors, wait...");
    AspectAnalyzer analyzer = new AspectAnalyzer(tokenModel, stnModel, tagModel, classNumber, fvFile, Ngram, lengthThreshold, aspectList, true);
    // Added by Mustafizur----------------
    analyzer.setMinimumNumberOfSentences(minimunNumberofSentence);
    analyzer.LoadStopwords(stopword);
    // Optional: load the SentiWordNet and positive/negative/negation word lists.
    // analyzer.loadPriorPosNegWords(sentiWordNet, pathToPosWords, pathToNegWords, pathToNegationWords);
    analyzer.setReleaseContent(releaseContent);
    // Added by Mustafizur----------------
    // Load all the documents as the data set.
    analyzer.LoadDirectory(folder, suffix);
    analyzer.setFeatureValues("TF", 0);
    // Get the collection of all the documents.
    _Corpus c = analyzer.returnCorpus(fvStatFile);
    pLSA tModel = null;
    if (topicmodel.equals("pLSA")) {
        tModel = new pLSA_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha);
    } else if (topicmodel.equals("LDA_Gibbs")) {
        tModel = new LDA_Gibbs(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, 0.4, 50);
    } else if (topicmodel.equals("LDA_Variational")) {
        tModel = new LDA_Variational_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, 10, -1);
    } else {
        System.out.println("The selected topic model has not developed yet!");
        return;
    }
    tModel.setDisplayLap(0);
    tModel.setSentiAspectPrior(aspectSentiPrior);
    tModel.LoadPrior(aspectSentiPrior ? aspectSentiList : aspectList, eta);
    tModel.EMonCorpus();
    // construct effective feature values for supervised classifiers
    analyzer.setFeatureValues("BM25", 2);
    // how to set this reasonably
    c.mapLabels(3);
    if (style.equals("SEMI")) {
        // perform transductive learning
        System.out.println("Start Transductive Learning, wait...");
        double learningRatio = 1.0;
        // k nearest labeled, k' nearest unlabeled
        int k = 30, kPrime = 20;
        // labeled data weight, unlabeled data weight
        double tAlpha = 1.0, tBeta = 0.1;
        // convergence of random walk, weight of random walk
        double tDelta = 1e-5, tEta = 0.6;
        boolean simFlag = false, weightedAvg = true;
        // bound for generating rating constraints (must be zero in binary case)
        int bound = 0;
        // top K similar documents for constructing pairwise ranking targets
        int topK = 25;
        double noiseRatio = 1.0;
        boolean metricLearning = true;
        // training LambdaRank with multi-threads
        boolean multithread_LR = true;
        GaussianFieldsByRandomWalk mySemi = null;
        if (method.equals("RW")) {
            mySemi = new GaussianFieldsByRandomWalk(c, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, weightedAvg);
        } else if (method.equals("RW-ML")) {
            mySemi = new LinearSVMMetricLearning(c, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, weightedAvg, bound);
            ((LinearSVMMetricLearning) mySemi).setMetricLearningMethod(metricLearning);
        } else if (method.equals("RW-L2R")) {
            mySemi = new L2RMetricLearning(c, multipleLearner, C, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, weightedAvg, topK, noiseRatio, multithread_LR);
        } else {
            // Without this guard an unrecognized method left mySemi null and the
            // calls below failed with a NullPointerException.
            System.out.println("The selected transductive method has not developed yet!");
            return;
        }
        mySemi.setSimilarity(simFlag);
        mySemi.setDebugOutput(debugOutput);
        mySemi.crossValidation(CVFold, c);
    } else if (style.equals("SUP")) {
        // perform supervised learning
        System.out.println("Start SVM, wait...");
        SVM mySVM = new SVM(c, C);
        mySVM.crossValidation(CVFold, c);
    } else {
        // Report an unrecognized style instead of finishing silently.
        System.out.println("Learning paradigm has not developed yet!");
    }
}

Also used : GaussianFieldsByRandomWalk(Classifier.semisupervised.GaussianFieldsByRandomWalk) SVM(Classifier.supervised.SVM) AspectAnalyzer(Analyzer.AspectAnalyzer) L2RMetricLearning(Classifier.metricLearning.L2RMetricLearning) structures._Corpus(structures._Corpus) LinearSVMMetricLearning(Classifier.metricLearning.LinearSVMMetricLearning) LDA_Gibbs(topicmodels.LDA.LDA_Gibbs) LDA_Variational_multithread(topicmodels.multithreads.LDA.LDA_Variational_multithread) topicmodels.multithreads.pLSA.pLSA_multithread(topicmodels.multithreads.pLSA.pLSA_multithread) topicmodels.pLSA.pLSA(topicmodels.pLSA.pLSA)

Example 7 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

the class AmazonReviewMain method main.

/**
 * Runs the Amazon review classification experiment with the settings hard-coded
 * below: performs feature selection over the documents under {@code folder},
 * reloads the documents as feature vectors, and then dispatches on {@code style}
 * to supervised learning, semi-supervised learning, or saving the vectors to file.
 *
 * @param args command-line arguments (not read; every setting is a local constant)
 * @throws IOException declared for the analyzer/classifier calls below
 * @throws ParseException declared for the analyzer/classifier calls below
 */
public static void main(String[] args) throws IOException, ParseException {
    /**
     ***Set these parameters before run the classifiers.****
     */
    // Define the number of classes
    int classNumber = 5;
    // N-gram setting handed to the analyzer and used in the feature file names.
    int Ngram = 2;
    // Document length threshold
    int lengthThreshold = 10;
    // The way of calculating the feature value: "TF", "TFIDF", "BM25", "PLN"
    String featureValue = "TF";
    // The way of normalization.
    // NOTE(review): the original comment said "only 1 and 2", yet 0 is used here -
    // presumably 0 means no normalization; confirm against setFeatureValues.
    int norm = 0;
    // k fold-cross validation
    int CVFold = 10;
    // "SUP", "SEMI", "FV", "ASPECT"
    String style = "SUP";
    // Which classifier to use: "NB", "LR", "SVM", "PR"
    String classifier = "SVM";
    // Semi-supervised model: "GF", "NB-EM". The default "SVM" matches neither,
    // so it must be changed when style is "SEMI".
    String model = "SVM";
    double C = 1.0;
    // String modelPath = "./data/Model/";
    // "data/debug/LR.output";
    String debugOutput = null;
    System.out.println("--------------------------------------------------------------------------------------");
    System.out.println("Parameters of this run:" + "\nClassNumber: " + classNumber + "\tNgram: " + Ngram + "\tFeatureValue: " + featureValue + "\tLearning Method: " + style + "\tClassifier: " + classifier + "\nCross validation: " + CVFold);
    // /*****Parameters in feature selection.*****/
    // Feature selection method.
    String featureSelection = "CHI";
    String stopwords = "./data/Model/stopwords.dat";
    // Used in feature selection, the starting point of the features.
    double startProb = 0.5;
    // Used in feature selection, the ending point of the features.
    double endProb = 0.999;
    // Document-frequency bounds passed to feature selection.
    int maxDF = -1, minDF = 1;
    // System.out.println("Feature Seleciton: " + featureSelection + "\tStarting probability: " + startProb + "\tEnding probability:" + endProb);
    /**
     ***The parameters used in loading files.****
     */
    String folder = "./data/amazon/tablet/small";
    String suffix = ".json";
    // Token model
    String tokenModel = "./data/Model/en-token.bin";
    String pattern = String.format("%dgram_%s", Ngram, featureSelection);
    String fvFile = String.format("data/Features/fv_%s_small.txt", pattern);
    String fvStatFile = String.format("data/Features/fv_stat_%s_small.txt", pattern);
    String vctFile = String.format("data/Fvs/vct_%s_tablet_small.dat", pattern);
    /**
     ***Parameters in time series analysis.****
     */
    int window = 0;
    System.out.println("Window length: " + window);
    System.out.println("--------------------------------------------------------------------------------------");
    // /****Loading json files*****/
    DocAnalyzer analyzer = new DocAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    analyzer.LoadStopwords(stopwords);
    // Load all the documents as the data set.
    analyzer.LoadDirectory(folder, suffix);
    // /****Feature selection*****/
    System.out.println("Performing feature selection, wait...");
    // Select the features.
    analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, maxDF, minDF);
    analyzer.SaveCVStat(fvStatFile);
    /**
     **create vectors for documents****
     */
    System.out.println("Creating feature vectors, wait...");
    // Second pass: a fresh analyzer that reads the feature file written above.
    analyzer = new DocAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
    // Just for debugging purpose: all the other classifiers do not need content
    analyzer.setReleaseContent(!(classifier.equals("PR") || debugOutput != null));
    // Load all the documents as the data set.
    analyzer.LoadDirectory(folder, suffix);
    analyzer.setFeatureValues(featureValue, norm);
    // //		analyzer.setTimeFeatures(window);
    _Corpus corpus = analyzer.getCorpus();
    // Execute different classifiers.
    if (style.equals("SUP")) {
        if (classifier.equals("NB")) {
            // Define a new naive bayes with the parameters.
            System.out.println("Start naive bayes, wait...");
            NaiveBayes myNB = new NaiveBayes(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else if (classifier.equals("LR")) {
            // Define a new logistics regression with the parameters.
            System.out.println("Start logistic regression, wait...");
            LogisticRegression myLR = new LogisticRegression(corpus, C);
            myLR.setDebugOutput(debugOutput);
            myLR.crossValidation(CVFold, corpus);
            // myLR.saveModel(modelPath + "LR.model");
        } else if (classifier.equals("SVM")) {
            System.out.println("Start SVM, wait...");
            SVM mySVM = new SVM(corpus, C);
            mySVM.crossValidation(CVFold, corpus);
        } else if (classifier.equals("PR")) {
            System.out.println("Start PageRank, wait...");
            PageRank myPR = new PageRank(corpus, C, 100, 50, 1e-6);
            myPR.train(corpus.getCollection());
        } else {
            System.out.println("Classifier has not developed yet!");
        }
    } else if (style.equals("SEMI")) {
        if (model.equals("GF")) {
            System.out.println("Start Gaussian Field, wait...");
            GaussianFields mySemi = new GaussianFields(corpus, classifier, C);
            mySemi.crossValidation(CVFold, corpus);
        } else if (model.equals("NB-EM")) {
            // corpus.setUnlabeled();
            System.out.println("Start Naive Bayes with EM, wait...");
            NaiveBayesEM myNB = new NaiveBayesEM(corpus);
            myNB.crossValidation(CVFold, corpus);
        } else {
            // Previously an unrecognized model (including the default "SVM")
            // fell through here without any output.
            System.out.println("Semi-supervised model has not developed yet!");
        }
    } else if (style.equals("FV")) {
        corpus.save2File(vctFile);
        System.out.format("Vectors saved to %s...\n", vctFile);
    } else {
        System.out.println("Learning paradigm has not developed yet!");
    }
}

Also used : structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) DocAnalyzer(Analyzer.DocAnalyzer) PageRank(influence.PageRank) NaiveBayesEM(Classifier.semisupervised.NaiveBayesEM) GaussianFields(Classifier.semisupervised.GaussianFields) SVM(Classifier.supervised.SVM) LogisticRegression(Classifier.supervised.LogisticRegression)

Example 8 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

the class Execution method main.

/**
 * Command-line driver: parses {@code args} into a {@link Parameter}, builds the
 * corpus (from a vector file when one exists, otherwise from raw documents with
 * optional feature selection), optionally runs PageRank instance weighting, and
 * then dispatches on {@code param.m_style} to a supervised classifier ("SUP"),
 * a semi-supervised classifier ("SEMI"), a topic model ("TM"), or saving the
 * feature vectors ("FV").
 *
 * @param args command-line arguments handed to {@link Parameter}
 * @throws IOException declared for the analyzer/model calls below
 * @throws ParseException declared for the analyzer/model calls below
 */
public static void main(String[] args) throws IOException, ParseException {
    Parameter param = new Parameter(args);
    System.out.println(param.toString());

    // The sentence and POS models are only passed on for the HTMM / LRHTMM models.
    boolean usesMarkovModel = param.m_model.equals("HTMM") || param.m_model.equals("LRHTMM");
    String stnModel = usesMarkovModel ? param.m_stnModel : null;
    String posModel = usesMarkovModel ? param.m_posModel : null;

    _Corpus corpus;
    Analyzer analyzer;
    boolean vectorFileExists = param.m_fvFile != null && (new File(param.m_fvFile)).exists();
    if (vectorFileExists) {
        // Load the data from the vector file.
        analyzer = new VctAnalyzer(param.m_classNumber, param.m_lengthThreshold, param.m_featureFile);
        analyzer.LoadDoc(param.m_fvFile);
        corpus = analyzer.getCorpus();
    } else {
        // Load the data from the text files.
        analyzer = new DocAnalyzer(param.m_tokenModel, stnModel, posModel, param.m_classNumber, param.m_featureFile, param.m_Ngram, param.m_lengthThreshold);
        ((DocAnalyzer) analyzer).setReleaseContent(!param.m_weightScheme.equals("PR"));
        if (param.m_featureFile == null) {
            // No feature file given: run feature selection first and record
            // where the selected features and their statistics are written.
            System.out.println("Performing feature selection, wait...");
            param.m_featureFile = String.format("./data/Features/%s_fv.dat", param.m_featureSelection);
            param.m_featureStat = String.format("./data/Features/%s_fv_stat.dat", param.m_featureSelection);
            System.out.println(param.printFeatureSelectionConfiguration());
            ((DocAnalyzer) analyzer).LoadStopwords(param.m_stopwords);
            analyzer.LoadDirectory(param.m_folder, param.m_suffix);
            analyzer.featureSelection(param.m_featureFile, param.m_featureSelection, param.m_startProb, param.m_endProb, param.m_maxDF, param.m_minDF);
        }
        // Collect the feature vectors for all documents.
        System.out.println("Creating feature vectors, wait...");
        analyzer.LoadDirectory(param.m_folder, param.m_suffix);
        analyzer.setFeatureValues(param.m_featureValue, param.m_norm);
        corpus = analyzer.returnCorpus(param.m_featureStat);
    }

    if (param.m_weightScheme.equals("PR")) {
        System.out.println("Creating PageRank instance weighting, wait...");
        PageRank pageRank = new PageRank(corpus, param.m_C, 100, 50, 1e-6);
        pageRank.train(corpus.getCollection());
    }

    // Run the requested learning paradigm.
    if (param.m_style.equals("SUP")) {
        BaseClassifier learner = null;
        if (param.m_model.equals("NB")) {
            System.out.println("Start naive bayes, wait...");
            learner = new NaiveBayes(corpus);
        } else if (param.m_model.equals("LR")) {
            System.out.println("Start logistic regression, wait...");
            learner = new LogisticRegression(corpus, param.m_C);
        } else if (param.m_model.equals("PR-LR")) {
            System.out.println("Start posterior regularized logistic regression, wait...");
            learner = new PRLogisticRegression(corpus, param.m_C);
        } else if (param.m_model.equals("SVM")) {
            // corpus.save2File("data/FVs/fvector.dat");
            System.out.println("Start SVM, wait...");
            learner = new SVM(corpus, param.m_C);
        } else {
            System.out.println("Classifier has not been developed yet!");
            System.exit(-1);
        }
        learner.setDebugOutput(param.m_debugOutput);
        learner.crossValidation(param.m_CVFold, corpus);
    } else if (param.m_style.equals("SEMI")) {
        BaseClassifier learner = null;
        if (param.m_model.equals("GF")) {
            System.out.println("Start Gaussian Field by matrix inversion, wait...");
            learner = new GaussianFields(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU);
        } else if (param.m_model.equals("GF-RW")) {
            System.out.println("Start Gaussian Field by random walk, wait...");
            learner = new GaussianFieldsByRandomWalk(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU, param.m_alpha, param.m_beta, param.m_converge, param.m_eta, param.m_weightedAvg);
        } else if (param.m_model.equals("GF-RW-ML")) {
            System.out.println("Start Gaussian Field with distance metric learning by random walk, wait...");
            learner = new LinearSVMMetricLearning(corpus, param.m_classifier, param.m_C, param.m_sampleRate, param.m_kUL, param.m_kUU, param.m_alpha, param.m_beta, param.m_converge, param.m_eta, param.m_weightedAvg, param.m_bound);
            // ((LinearSVMMetricLearning)model).setMetricLearningMethod(false);
            // ((LinearSVMMetricLearning)model).verification(param.m_CVFold, corpus, param.m_debugOutput);
        } else {
            System.out.println("Classifier has not been developed yet!");
            System.exit(-1);
        }
        learner.setDebugOutput(param.m_debugOutput);
        learner.crossValidation(param.m_CVFold, corpus);
    } else if (param.m_style.equals("TM")) {
        TopicModel topicModel = null;
        if (param.m_model.equals("2topic")) {
            topicModel = new twoTopic(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda);
        } else if (param.m_model.equals("pLSA")) {
            if (param.m_multithread) {
                topicModel = new pLSA_multithread(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha);
            } else {
                topicModel = new pLSA(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha);
            }
            ((pLSA) topicModel).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("vLDA")) {
            if (param.m_multithread) {
                topicModel = new LDA_Variational_multithread(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_maxVarIterations, param.m_varConverge);
            } else {
                topicModel = new LDA_Variational(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_maxVarIterations, param.m_varConverge);
            }
            ((LDA_Variational) topicModel).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("gLDA")) {
            topicModel = new LDA_Gibbs(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_lambda, param.m_numTopics, param.m_alpha, param.m_burnIn, param.m_lag);
            ((LDA_Gibbs) topicModel).LoadPrior(param.m_priorFile, param.m_gamma);
        } else if (param.m_model.equals("HTMM")) {
            topicModel = new HTMM(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_numTopics, param.m_alpha);
        } else if (param.m_model.equals("LRHTMM")) {
            topicModel = new LRHTMM(param.m_maxmIterations, param.m_converge, param.m_beta, corpus, param.m_numTopics, param.m_alpha, param.m_C);
        } else {
            System.out.println("The specified topic model has not been developed yet!");
            System.exit(-1);
        }
        if (param.m_CVFold > 1) {
            topicModel.crossValidation(param.m_CVFold);
        } else {
            // No cross validation requested: fit on the whole corpus and
            // print the top 10 words.
            topicModel.EMonCorpus();
            topicModel.printTopWords(10);
        }
    } else if (param.m_style.equals("FV")) {
        corpus.save2File(param.m_fvFile);
        System.out.format("Vectors saved to %s...\n", param.m_fvFile);
    } else {
        System.out.println("Learning paradigm has not developed yet!");
    }
}

Also used : PRLogisticRegression(Classifier.supervised.PRLogisticRegression) GaussianFieldsByRandomWalk(Classifier.semisupervised.GaussianFieldsByRandomWalk) DocAnalyzer(Analyzer.DocAnalyzer) topicmodels.twoTopic(topicmodels.twoTopic) VctAnalyzer(Analyzer.VctAnalyzer) SVM(Classifier.supervised.SVM) DocAnalyzer(Analyzer.DocAnalyzer) VctAnalyzer(Analyzer.VctAnalyzer) Analyzer(Analyzer.Analyzer) TopicModel(topicmodels.TopicModel) LDA_Variational(topicmodels.LDA.LDA_Variational) HTMM(topicmodels.markovmodel.HTMM) LRHTMM(topicmodels.markovmodel.LRHTMM) structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) BaseClassifier(Classifier.BaseClassifier) LDA_Variational_multithread(topicmodels.multithreads.LDA.LDA_Variational_multithread) topicmodels.multithreads.pLSA.pLSA_multithread(topicmodels.multithreads.pLSA.pLSA_multithread) PRLogisticRegression(Classifier.supervised.PRLogisticRegression) LogisticRegression(Classifier.supervised.LogisticRegression) topicmodels.pLSA.pLSA(topicmodels.pLSA.pLSA) LRHTMM(topicmodels.markovmodel.LRHTMM) PageRank(influence.PageRank) LinearSVMMetricLearning(Classifier.metricLearning.LinearSVMMetricLearning) LDA_Gibbs(topicmodels.LDA.LDA_Gibbs) Parameter(structures.Parameter) GaussianFields(Classifier.semisupervised.GaussianFields) File(java.io.File)

Example 9 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

the class MovieReviewMain method main.

/**
 * Main function: builds feature vectors for the movie review documents under
 * {@code folder} and runs the selected classifier with 10-fold cross validation.
 * <p>
 * Whether a provided controlled vocabulary (CV) file is used and whether feature
 * selection is run first is decided by {@code providedCV} and
 * {@code featureSelection} being empty or not, giving four cases.
 *
 * @param args command-line arguments (not read; every setting is a local constant)
 * @throws IOException declared for the analyzer/classifier calls below
 */
public static void main(String[] args) throws IOException {
    /**
     ***Set these parameters before run the classifiers.****
     */
    // Define the number of classes.
    int classNumber = 2;
    // N-gram setting handed to the analyzer.
    int Ngram = 1;
    // Document length threshold
    int lengthThreshold = 5;
    // The way of calculating the feature value, which can also be "TFIDF", "BM25"
    String featureValue = "TF";
    int norm = 1;
    // Which classifier to use.
    String classifier = "SVM";
    System.out.println("--------------------------------------------------------------------------------------");
    System.out.println("Parameters of this run:" + "\nClassNumber: " + classNumber + "\tNgram: " + Ngram + "\tFeatureValue: " + featureValue + "\tClassifier: " + classifier);
    /**
     ***The parameters used in loading files.****
     */
    String folder = "data/txt_sentoken";
    String suffix = ".txt";
    // Token model.
    String tokenModel = "./data/Model/en-token.bin";
    // NOTE(review): these are absolute, machine-specific paths; adjust before running elsewhere.
    // String finalLocation = "/Users/lingong/Documents/Lin'sWorkSpace/IR_Base/data/movie/FinalFeatureStat.txt"; //The destination of storing the final features with stats.
    // String featureLocation = "/Users/lingong/Documents/Lin'sWorkSpace/IR_Base/data/movie/SelectedFeatures.txt";
    String finalLocation = "/home/lin/Lin'sWorkSpace/IR_Base/FinalFeatureStat.txt";
    String featureLocation = "/home/lin/Lin'sWorkSpace/IR_Base/SelectedFeatures.txt";
    /**
     ***Parameters in feature selection.****
     */
    // String providedCV = "";
    String featureSelection = "";
    // Provided CV.
    String providedCV = "Features.txt";
    // String featureSelection = "MI"; //Feature selection method.
    // Used in feature selection, the starting point of the features.
    double startProb = 0.5;
    // Used in feature selection, the ending point of the features.
    double endProb = 1;
    // Document-frequency bounds passed to feature selection.
    int maxDF = -1, minDF = 5;
    System.out.println("Feature Seleciton: " + featureSelection + "\tStarting probability: " + startProb + "\tEnding probability:" + endProb);
    System.out.println("--------------------------------------------------------------------------------------");

    boolean hasProvidedCV = !providedCV.isEmpty();
    boolean doFeatureSelection = !featureSelection.isEmpty();
    // Feature file handed to the analyzer that builds the final vectors
    // (null means no feature file).
    String vocabularyFile;
    if (!doFeatureSelection) {
        if (hasProvidedCV) {
            // Case 2: provided CV, no feature selection.
            System.out.println("Case 2: provided CV, no feature selection. Start loading files, wait...");
            vocabularyFile = providedCV;
        } else {
            // Case 1: no provided CV, no feature selection.
            System.out.println("Case 1: no provided CV, no feature selection.  Start loading files, wait...");
            vocabularyFile = null;
        }
    } else {
        DocAnalyzer selector;
        if (hasProvidedCV) {
            // Case 4: provided CV, feature selection.
            selector = new DocAnalyzer(tokenModel, classNumber, providedCV, Ngram, lengthThreshold);
            System.out.println("Case 4: provided CV, feature selection. Start loading files to do feature selection, wait...");
        } else {
            // Case 3: no provided CV, feature selection.
            System.out.println("Case 3: no provided CV, feature selection. Start loading files to do feature selection, wait...");
            selector = new DocAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
        }
        // Load all the documents as the data set, then select the features.
        selector.LoadDirectory(folder, suffix);
        selector.featureSelection(featureLocation, featureSelection, startProb, endProb, maxDF, minDF);
        System.out.println("Start loading files, wait...");
        vocabularyFile = featureLocation;
    }

    // Final pass shared by all four cases: load the documents and build the corpus.
    DocAnalyzer analyzer = new DocAnalyzer(tokenModel, classNumber, vocabularyFile, Ngram, lengthThreshold);
    analyzer.LoadDirectory(folder, suffix);
    analyzer.setFeatureValues(featureValue, norm);
    _Corpus corpus = analyzer.returnCorpus(finalLocation);

    // Execute different classifiers.
    if (classifier.equals("NB")) {
        // Define a new naive bayes with the parameters.
        System.out.println("Start naive bayes, wait...");
        NaiveBayes myNB = new NaiveBayes(corpus);
        // Use the movie reviews for testing the codes.
        myNB.crossValidation(10, corpus);
    } else if (classifier.equals("LR")) {
        // Second constructor argument of LogisticRegression.
        double lambda = 0;
        // Define a new logistics regression with the parameters.
        System.out.println("Start logistic regression, wait...");
        LogisticRegression myLR = new LogisticRegression(corpus, lambda);
        // Use the movie reviews for testing the codes.
        myLR.crossValidation(10, corpus);
    } else if (classifier.equals("SVM")) {
        // corpus.save2File("data/FVs/fvector.dat");
        // The default value is 1.
        double C = 3;
        System.out.println("Start SVM, wait...");
        SVM mySVM = new SVM(corpus, C);
        mySVM.crossValidation(10, corpus);
    } else {
        System.out.println("Have not developed yet!:(");
    }
}

Also used : structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) DocAnalyzer(Analyzer.DocAnalyzer) SVM(Classifier.supervised.SVM) LogisticRegression(Classifier.supervised.LogisticRegression)

Example 10 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

the class BaseClassifier method crossValidation.

/**
 * k-fold cross validation.
 * <p>
 * The corpus is shuffled into {@code k} folds via {@code c.shuffle(k)}. In round
 * {@code i}, documents whose mask equals {@code (i + 1) % k} or {@code (i + 2) % k}
 * form the training set and all remaining documents form the test set, so more
 * data is used for testing than for training. After each round the model is
 * trained and tested and the result is printed; once all rounds are done
 * {@code calculateMeanVariance} is called on the collected precision/recall values.
 * <p>
 * When {@code m_debugOutput} is set, a UTF-8 debug writer is opened on that path
 * (overwriting any existing file) and is closed when the method exits, whether
 * it finishes normally or with an exception.
 *
 * @param k number of folds
 * @param c corpus providing the documents and the fold masks
 */
public void crossValidation(int k, _Corpus c) {
    try {
        if (m_debugOutput != null) {
            m_debugWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(m_debugOutput, false), "UTF-8"));
            m_debugWriter.write(this.toString() + "\n");
        }
        c.shuffle(k);
        int[] masks = c.getMasks();
        ArrayList<_Doc> docs = c.getCollection();
        // Iterate over all k folds, setting up the train set and test set for each.
        for (int i = 0; i < k; i++) {
            for (int j = 0; j < masks.length; j++) {
                // more for testing: only two folds are used for training
                // (a third fold, masks[j]==(i+3)%k, was tried and disabled)
                if (masks[j] == (i + 1) % k || masks[j] == (i + 2) % k) {
                    m_trainSet.add(docs.get(j));
                } else {
                    m_testSet.add(docs.get(j));
                }
                // //more for training
                // if(masks[j]==i)
                // m_testSet.add(docs.get(j));
                // else
                // m_trainSet.add(docs.get(j));
            }
            long start = System.currentTimeMillis();
            train();
            double accuracy = test();
            System.out.format("%s Train/Test finished in %.2f seconds with accuracy %.4f and F1 (%s)...\n", this.toString(), (System.currentTimeMillis() - start) / 1000.0, accuracy, getF1String());
            m_trainSet.clear();
            m_testSet.clear();
        }
        calculateMeanVariance(m_precisionsRecalls);
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the file handle is released even when an
        // exception interrupts the run; previously it leaked in that case.
        if (m_debugOutput != null && m_debugWriter != null) {
            try {
                m_debugWriter.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

Also used : structures._Doc(structures._Doc) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter)

Aggregations

structures._Corpus (structures._Corpus)11 File (java.io.File)7 ParentChildAnalyzer (Analyzer.ParentChildAnalyzer)5 SVM (Classifier.supervised.SVM)5 structures._Doc (structures._Doc)5 LogisticRegression (Classifier.supervised.LogisticRegression)4 NaiveBayes (Classifier.supervised.NaiveBayes)4 DocAnalyzer (Analyzer.DocAnalyzer)3 LinearSVMMetricLearning (Classifier.metricLearning.LinearSVMMetricLearning)3 GaussianFields (Classifier.semisupervised.GaussianFields)3 GaussianFieldsByRandomWalk (Classifier.semisupervised.GaussianFieldsByRandomWalk)3 PageRank (influence.PageRank)3 FileOutputStream (java.io.FileOutputStream)3 ArrayList (java.util.ArrayList)3 Calendar (java.util.Calendar)3 LDA_Gibbs (topicmodels.LDA.LDA_Gibbs)3 LDA_Variational_multithread (topicmodels.multithreads.LDA.LDA_Variational_multithread)3 topicmodels.multithreads.pLSA.pLSA_multithread (topicmodels.multithreads.pLSA.pLSA_multithread)3 topicmodels.pLSA.pLSA (topicmodels.pLSA.pLSA)3 VctAnalyzer (Analyzer.VctAnalyzer)2