Search in sources :

Example 11 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

Class LDAGibbs4AC, method crossValidation.

/**
 * Runs k-fold cross validation over the parent documents of {@code m_corpus}
 * and prints the mean and spread of the per-fold score returned by
 * {@code Evaluation(int)} under the label "Perplexity".
 *
 * <p>Only documents that are instances of {@code _ParentDoc} take part in the
 * fold assignment. For each fold, parents whose mask equals the fold index
 * form the test set; every other parent goes to the training set together with
 * its child documents. Child documents of test parents are added to neither set.
 *
 * <p>Folds are only built when {@code m_randomFold} is set. Before this fix
 * the method fell through to the summary with a {@code null} score array in
 * that case and failed with a {@code NullPointerException}; it now reports the
 * situation on both outputs and returns without evaluating anything.
 *
 * @param k number of folds
 */
public void crossValidation(int k) {
    m_trainSet = new ArrayList<_Doc>();
    m_testSet = new ArrayList<_Doc>();

    // Gather the parent documents twice: in a dedicated corpus (used for the
    // mask bookkeeping) and in a list that is indexed in the same order.
    _Corpus parentCorpus = new _Corpus();
    ArrayList<_ParentDoc> parentDocs = new ArrayList<_ParentDoc>();
    for (_Doc d : m_corpus.getCollection()) {
        if (d instanceof _ParentDoc) {
            parentCorpus.addDoc(d);
            parentDocs.add((_ParentDoc) d);
        }
    }
    System.out.println("size of parent docs\t" + parentDocs.size());
    parentCorpus.setMasks();

    if (!m_randomFold) {
        // No non-random fold strategy is implemented in this method, so there
        // are no scores to summarise; bail out instead of dereferencing null.
        String msg = "crossValidation: only random folds are supported here (m_randomFold is false); no fold was evaluated.";
        System.out.println(msg);
        infoWriter.println(msg);
        return;
    }

    double[] perf = new double[k];
    parentCorpus.shuffle(k);
    int[] masks = parentCorpus.getMasks();
    for (int i = 0; i < k; i++) {
        for (int j = 0; j < masks.length; j++) {
            _ParentDoc parent = parentDocs.get(j);
            if (masks[j] == i) {
                m_testSet.add(parent);
            } else {
                m_trainSet.add(parent);
                for (_ChildDoc child : parent.m_childDocs) {
                    m_trainSet.add(child);
                }
            }
        }
        // writeFile(i, m_trainSet, m_testSet);
        System.out.println("Fold number " + i);
        infoWriter.println("Fold number " + i);
        System.out.println("Train Set Size " + m_trainSet.size());
        infoWriter.println("Train Set Size " + m_trainSet.size());
        System.out.println("Test Set Size " + m_testSet.size());
        infoWriter.println("Test Set Size " + m_testSet.size());

        long start = System.currentTimeMillis();
        EM();
        perf[i] = Evaluation(i);
        // Measure once so the console and the info file report the same value.
        double seconds = (System.currentTimeMillis() - start) / 1000.0;
        System.out.format("%s Train/Test finished in %.2f seconds...\n", this.toString(), seconds);
        infoWriter.format("%s Train/Test finished in %.2f seconds...\n", this.toString(), seconds);

        // The sets of the last fold are deliberately left populated.
        if (i < k - 1) {
            m_trainSet.clear();
            m_testSet.clear();
        }
    }

    // Mean and (population) standard deviation of the per-fold scores.
    double mean = Utils.sumOfArray(perf) / k, var = 0;
    for (int i = 0; i < perf.length; i++) {
        var += (perf[i] - mean) * (perf[i] - mean);
    }
    var = Math.sqrt(var / k);
    System.out.format("Perplexity %.3f+/-%.3f\n", mean, var);
    infoWriter.format("Perplexity %.3f+/-%.3f\n", mean, var);
}
Also used : structures._Corpus(structures._Corpus) structures._ChildDoc(structures._ChildDoc) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) ArrayList(java.util.ArrayList)

Example 12 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

Class TopicModelMain, method main.

/**
 * Entry point that configures and runs a single topic model over a corpus of
 * parent (article) and child (comment) documents.
 *
 * <p>Every setting is hard-coded below. The method redirects
 * {@code System.out} to {@code consoleOutput.txt} inside a time-stamped folder
 * under {@code ./data/results}, loads the article and comment directories
 * through a {@code ParentChildAnalyzer}, builds the model named by
 * {@code topicmodel}, and then either fits it on the whole corpus
 * ({@code crossV <= 1}) or runs cross validation.
 *
 * @param args command-line arguments; not used
 * @throws IOException declared for the file and analyzer operations performed here
 * @throws ParseException declared on the original signature; NOTE(review): the
 *         call that raises it is not visible from this method - confirm
 * @throws IllegalArgumentException if {@code topicmodel} names no model handled below
 */
public static void main(String[] args) throws IOException, ParseException {
    int mb = 1024 * 1024;
    Runtime rTime = Runtime.getRuntime();
    System.out.println("totalMem\t:" + rTime.totalMemory() / mb);
    // Number of classes handed to the analyzer.
    int classNumber = 5;
    // The default value is unigram.
    int Ngram = 1;
    // The way of calculating the feature value, which can also be "TFIDF", "BM25"
    String featureValue = "TF";
    // The way of normalization (only 1 and 2).
    int norm = 0;
    // Document length threshold
    int lengthThreshold = 5;
    // each document should have at least 2 sentences
    int minimunNumberofSentence = 2;

    /* ---- parameters for the topic model ---- */
    // Model names mentioned by the original author (not all of them have a branch below):
    // ACCTM, ACCTM_TwoTheta, ACCTM_C, ACCTM_CZ, ACCTM_CZLR, LDAonArticles,
    // correspondence_LDA_Gibbs, LDA_Gibbs_Debug, LDA_Variational_multithread,
    // 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM,
    // LDAGibbs4AC_test, DCMCorrLDA_multi_E_test, DCMLDA4AC_test, DCMDMCorrLDA_multi_E_test,
    // DCMDMCorrLDA_test, DCMDMMCorrLDA_test, corrLDA_Gibbs_test,
    // DCMCorrLDA_Multi_EM, sparseDCMLDA_test, DCMLDA_test, sparseLDA_test, LDA_Gibbs_test,
    // sparseClusterDCMLDA, sparseClusterDCMLDA_test
    String topicmodel = "weightedCorrespondenceModel_test";
    String category = "tablet";
    int number_of_topics = 15;
    // false means in training there is no reviews from NewEgg
    boolean loadNewEggInTrain = true;
    // false means no shuffling and true means shuffling
    boolean setRandomFold = true;
    // 0 means nothing loaded as prior; 1 = load both senti and aspect; 2 means load only aspect
    int loadAspectSentiPrior = 0;
    // these two parameters must be larger than 1!!!
    double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
    // negative converge means do not need to check likelihood convergency
    double converge = 1e-9, lambda = 0.9;
    int varIter = 10;
    double varConverge = 1e-5;
    int topK = 20, number_of_iteration = 50, crossV = 1;
    int gibbs_iteration = 1000, gibbs_lag = 50;
    int displayLap = 20;
    // gibbs_iteration = 50;
    // gibbs_lag = 20;
    // displayLap = 20;
    double burnIn = 0.4;
    boolean sentence = false;
    // most popular items under each category from Amazon
    // needed for docSummary
    String[] tabletProductList = { "B008GFRDL0" };
    String[] cameraProductList = { "B005IHAIMA" };
    String[] phoneProductList = { "B00COYOAYW" };
    String[] tvProductList = { "B0074FGLUM" };

    /* ---- parameters used in loading files ---- */
    String amazonFolder = "./data/amazon/tablet/topicmodel";
    String newEggFolder = "./data/NewEgg";
    String articleType = "Tech";
    // articleType = "Reuters";
    // articleType = "Gadgets";
    // articleType = "Yahoo";
    // articleType = "APP";
    String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
    // articleFolder = String.format(
    // "./data/ParentChildTopicModel/Reuters",
    // articleType);
    String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
    String suffix = ".json";
    // Token model.
    String tokenModel = "./data/Model/en-token.bin";
    String stnModel = null;
    String posModel = null;
    // The markov-style models additionally get sentence and POS model paths.
    if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
        // Sentence model.
        stnModel = "./data/Model/en-sent.bin";
        // POS model.
        posModel = "./data/Model/en-pos-maxent.bin";
        sentence = true;
    }
    String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
    String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, articleType, topicmodel);
    String aspectList = "./data/Model/aspect_" + category + ".txt";
    String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
    String pathToPosWords = "./data/Model/SentiWordsPos.txt";
    String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
    String pathToNegationWords = "./data/Model/negation_words.txt";
    String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";

    /* ---- output locations ---- */
    File rootFolder = new File("./data/results");
    if (!rootFolder.exists()) {
        System.out.println("creating root directory" + rootFolder);
        // mkdirs() also creates a missing "./data" parent; mkdir() would fail
        // silently and only surface later as a FileNotFoundException.
        rootFolder.mkdirs();
    }
    SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd-HHmm");
    String filePrefix = String.format("./data/results/%s", dateFormatter.format(new Date()));
    filePrefix = filePrefix + "-" + topicmodel + "-" + articleType;
    File resultFolder = new File(filePrefix);
    if (!resultFolder.exists()) {
        System.out.println("creating directory" + resultFolder);
        resultFolder.mkdirs();
    }
    // From here on everything printed to System.out lands in consoleOutput.txt.
    String outputFile = filePrefix + "/consoleOutput.txt";
    PrintStream printStream = new PrintStream(new FileOutputStream(outputFile));
    System.setOut(printStream);
    String infoFilePath = filePrefix + "/Information.txt";
    // store top k words distribution over topic
    String topWordPath = filePrefix + "/topWords.txt";

    /* ---- parameters in feature selection ---- */
    String stopwords = "./data/Model/stopwords.dat";
    // Feature selection method.
    String featureSelection = "DF";
    // Used in feature selection, the starting point of the features.
    double startProb = 0.;
    // Used in feature selection, the ending point of the features.
    double endProb = 0.999;
    // Filter the features with DFs smaller than this threshold.
    int DFthreshold = 5;
    System.out.println("Performing feature selection, wait...");
    // jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
    // jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
    // newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
    // analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.

    /* ---- parent child topic model ---- */
    ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
    // analyzer.LoadDirectory(commentFolder, suffix);
    if (topicmodel.equals("LDA_APPMerged"))
        articleFolder = String.format("./data/ParentChildTopicModel/%sDescriptionsReviews", articleType);
    // articleFolder = String.format(
    // "./data/ParentChildTopicModel/%sArticles4Merged",
    // articleType);
    //
    // commentFolder = String.format(
    // "./data/ParentChildTopicModel/%sComments4Merged",
    // articleType);
    //
    analyzer.LoadParentDirectory(articleFolder, suffix);
    // analyzer.LoadDirectory(articleFolder, suffix);
    analyzer.LoadChildDirectory(commentFolder, suffix);
    // analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
    System.out.println("Creating feature vectors, wait...");
    // analyzer.LoadNewEggDirectory(newEggFolder, suffix); //Load all the documents as the data set.
    // analyzer.LoadDirectory(amazonFolder, suffix);
    analyzer.setFeatureValues(featureValue, norm);
    // Get the collection of all the documents.
    _Corpus c = analyzer.returnCorpus(fvStatFile);

    if (topicmodel.equals("2topic")) {
        twoTopic model = new twoTopic(number_of_iteration, converge, beta, c, lambda);
        if (crossV <= 1) {
            for (_Doc d : c.getCollection()) {
                model.inference(d);
                model.printTopWords(topK);
            }
        } else
            model.crossValidation(crossV);
    } else {
        pLSA model = null;
        if (topicmodel.equals("pLSA")) {
            model = new pLSA_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha);
        } else if (topicmodel.equals("LDA_Gibbs")) {
            // number_of_topics = 15;
            // in gibbs sampling, no need to compute log-likelihood during sampling
            model = new LDA_Gibbs(gibbs_iteration, 0, beta, c, lambda, number_of_topics, alpha, burnIn, gibbs_lag);
        } else if (topicmodel.equals("LDA_Variational_multithread")) {
            model = new LDA_Variational_multithread(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, varIter, varConverge);
        } else if (topicmodel.equals("HTMM")) {
            model = new HTMM(number_of_iteration, converge, beta, c, number_of_topics, alpha);
        } else if (topicmodel.equals("HTSM")) {
            model = new HTSM(number_of_iteration, converge, beta, c, number_of_topics, alpha);
        } else if (topicmodel.equals("LRHTMM")) {
            model = new LRHTMM(number_of_iteration, converge, beta, c, number_of_topics, alpha, lambda);
        } else if (topicmodel.equals("LRHTSM")) {
            model = new LRHTSM(number_of_iteration, converge, beta, c, number_of_topics, alpha, lambda);
        } else if (topicmodel.equals("correspondence_LDA_Gibbs")) {
            double ksi = 800;
            double tau = 0.7;
            // in gibbs sampling, no need to compute log-likelihood during sampling
            model = new corrLDA_Gibbs(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag);
        } else if (topicmodel.equals("ACCTM")) {
            double mu = 1.0;
            double[] gamma = { 0.5, 0.5 };
            double ksi = 800;
            double tau = 0.7;
            model = new ACCTM(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag);
        } else if (topicmodel.equals("ACCTM_C")) {
            double mu = 1.0;
            double[] gamma = { 0.5, 0.5 };
            beta = 1.001;
            double ksi = 800;
            double tau = 0.7;
            converge = 1e-5;
            model = new ACCTM_C(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
        } else if (topicmodel.equals("ACCTM_CHard")) {
            double mu = 1.0;
            double[] gamma = { 0.5, 0.5 };
            double ksi = 800;
            double tau = 0.7;
            beta = 1.001;
            model = new ACCTM_CHard(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
        } else if (topicmodel.equals("ACCTM_CZ")) {
            double mu = 1.0;
            double[] gamma = { 0.5, 0.5 };
            beta = 1.001;
            alpha = 1.01;
            double ksi = 800;
            double tau = 0.7;
            // number_of_topics = 30;
            model = new ACCTM_CZ(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
        } else if (topicmodel.equals("ACCTM_CZLR")) {
            double mu = 1.0;
            double[] gamma = { 0.5, 0.5 };
            beta = 1.001;
            alpha = 1.01;
            double ksi = 800;
            double tau = 0.7;
            // number_of_topics = 30;
            converge = 1e-9;
            model = new ACCTM_CZLR(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, gamma);
        } else if (topicmodel.equals("DCMLDA_test")) {
            converge = 1e-3;
            int newtonIter = 50;
            double newtonConverge = 1e-3;
            // number_of_topics = 15;
            model = new DCMLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, newtonIter, newtonConverge);
            String priorFile = "./data/Features/" + articleType + "TopicWord.txt";
            model.LoadPrior(priorFile, eta);
        } else if (topicmodel.equals("LDA_Gibbs_test")) {
            // number_of_topics = 15;
            // in gibbs sampling, no need to compute
            // log-likelihood during sampling
            model = new LDA_Gibbs_test(gibbs_iteration, 0, beta, c, lambda, number_of_topics, alpha, burnIn, gibbs_lag);
        } else if (topicmodel.equals("DCMCorrLDA_multi_E_test")) {
            converge = 1e-2;
            int newtonIter = 30;
            double newtonConverge = 1e-2;
            gibbs_iteration = 40;
            gibbs_lag = 10;
            double ksi = 800;
            double tau = 0.7;
            double alphaC = 0.001;
            model = new DCMCorrLDA_multi_E_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, alphaC, burnIn, ksi, tau, gibbs_lag, newtonIter, newtonConverge);
            String priorFile = "./data/Features/" + articleType + "TopWords.txt";
            model.LoadPrior(priorFile, eta);
        } else if (topicmodel.equals("LDAGibbs4AC_test")) {
            double ksi = 800;
            double tau = 0.7;
            model = new LDAGibbs4AC_test(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, ksi, tau);
        } else if (topicmodel.equals("DCMLDA4AC_test")) {
            // number_of_topics = 5;
            converge = 1e-3;
            double ksi = 800;
            double tau = 0.7;
            int newtonIter = 1000;
            double newtonConverge = 1e-3;
            model = new DCMLDA4AC_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, ksi, tau, newtonIter, newtonConverge);
        } else if (topicmodel.equals("DCMCorrLDA_test")) {
            // number_of_topics = 15;
            converge = 1e-3;
            int newtonIter = 50;
            double newtonConverge = 1e-3;
            double ksi = 800;
            double tau = 0.7;
            double alphaC = 0.001;
            model = new DCMCorrLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, alphaC, burnIn, ksi, tau, gibbs_lag, newtonIter, newtonConverge);
        } else if (topicmodel.equals("sparseDCMLDA_test")) {
            converge = 1e-3;
            int newtonIter = 50;
            double newtonConverge = 1e-3;
            // number_of_topics = 15;
            double tParam = 1;
            double sParam = 1;
            model = new sparseDCMLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, newtonIter, newtonConverge, tParam, sParam);
            /*
             * String priorFile = "./data/Features/" + articleType +
             * "TopicWord.txt"; model.LoadPrior(priorFile, eta);
             */
        } else if (topicmodel.equals("sparseLDA_test")) {
            converge = 1e-3;
            // number_of_topics = 15;
            double tParam = 1;
            double sParam = 1;
            model = new sparseLDA_test(gibbs_iteration, 0, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, tParam, sParam);
        } else if (topicmodel.equals("sparseClusterDCMLDA_test")) {
            converge = 1e-3;
            int newtonIter = 50;
            double newtonConverge = 1e-3;
            double tParam = 1;
            double sParam = 1;
            double gammaParam = 0.01;
            int clusterNum = 1;
            model = new sparseClusterDCMLDA_test(gibbs_iteration, converge, beta - 1, c, lambda, number_of_topics, alpha - 1, burnIn, gibbs_lag, newtonIter, newtonConverge, tParam, sParam, clusterNum, gammaParam);
            /*
             * String priorFile = "./data/Features/" + articleType +
             * "TopicWord.txt"; model.LoadPrior(priorFile, eta);
             */
        } else if (topicmodel.equals("weightedCorrespondenceModel_test")) {
            beta = beta - 1;
            alpha = alpha - 1;
            // number_of_iteration = 2;
            double lbfgsConverge = varConverge;
            converge = 1e-3;
            model = new weightedCorrespondenceModel_test(number_of_iteration, converge, beta, c, lambda, number_of_topics, alpha, varIter, varConverge, lbfgsConverge);
            //
            // String priorFile = "./data/Features/" + articleType + "TopicWord.txt";
            // model.LoadPrior(priorFile, eta);
        }

        if (model == null) {
            // No branch above matched. Fail with a message that names the
            // offending setting instead of a NullPointerException on the
            // first use of model below.
            throw new IllegalArgumentException("Unknown topic model: " + topicmodel);
        }

        model.setDisplayLap(displayLap);
        model.setInforWriter(infoFilePath);
        if (loadAspectSentiPrior == 1) {
            System.out.println("Loading aspect-senti list from " + aspectSentiList);
            model.setSentiAspectPrior(true);
            model.LoadPrior(aspectSentiList, eta);
        } else if (loadAspectSentiPrior == 2) {
            System.out.println("Loading aspect list from " + aspectList);
            model.setSentiAspectPrior(false);
            model.LoadPrior(aspectList, eta);
        } else {
            System.out.println("No prior is added!!");
        }
        if (crossV <= 1) {
            // Single run on the whole corpus.
            model.EMonCorpus();
            if (topWordPath == null)
                model.printTopWords(topK);
            else
                model.printTopWords(topK, topWordPath);
        } else {
            model.setRandomFold(setRandomFold);
            double trainProportion = 0.8;
            double testProportion = 1 - trainProportion;
            model.setPerplexityProportion(testProportion);
            model.crossValidation(crossV);
            model.printTopWords(topK, topWordPath);
        }
        model.closeWriter();
        // sentence is only set for HTMM / LRHTMM / HTSM / LRHTSM (see above).
        if (sentence) {
            String summaryFilePath = "./data/results/Topics_" + number_of_topics + "_Summary.txt";
            model.setSummaryWriter(summaryFilePath);
            if (category.equalsIgnoreCase("camera"))
                ((HTMM) model).docSummary(cameraProductList);
            else if (category.equalsIgnoreCase("tablet"))
                ((HTMM) model).docSummary(tabletProductList);
            else if (category.equalsIgnoreCase("phone"))
                ((HTMM) model).docSummary(phoneProductList);
            else if (category.equalsIgnoreCase("tv"))
                ((HTMM) model).docSummary(tvProductList);
        }
    }
}
Also used : topicmodels.correspondenceModels.weightedCorrespondenceModel_test(topicmodels.correspondenceModels.weightedCorrespondenceModel_test) LRHTSM(topicmodels.markovmodel.LRHTSM) HTSM(topicmodels.markovmodel.HTSM) LDA_Gibbs_test(topicmodels.LDA.LDA_Gibbs_test) topicmodels.correspondenceModels.corrLDA_Gibbs_test(topicmodels.correspondenceModels.corrLDA_Gibbs_test) topicmodels.correspondenceModels.corrLDA_Gibbs(topicmodels.correspondenceModels.corrLDA_Gibbs) topicmodels.twoTopic(topicmodels.twoTopic) ACCTM_CZLR(topicmodels.correspondenceModels.ACCTM_CZLR) HTMM(topicmodels.markovmodel.HTMM) LRHTMM(topicmodels.markovmodel.LRHTMM) structures._Corpus(structures._Corpus) DCMLDA4AC_test(topicmodels.correspondenceModels.DCMLDA4AC_test) DCM.sparseClusterDCMLDA_test(topicmodels.DCM.sparseClusterDCMLDA_test) LDA_Variational_multithread(topicmodels.multithreads.LDA.LDA_Variational_multithread) LDAGibbs4AC_test(topicmodels.correspondenceModels.LDAGibbs4AC_test) topicmodels.multithreads.pLSA.pLSA_multithread(topicmodels.multithreads.pLSA.pLSA_multithread) DCMLDA_test(topicmodels.DCM.DCMLDA_test) DCM.sparseDCMLDA_test(topicmodels.DCM.sparseDCMLDA_test) DCM.sparseClusterDCMLDA_test(topicmodels.DCM.sparseClusterDCMLDA_test) topicmodels.pLSA.pLSA(topicmodels.pLSA.pLSA) LDA.sparseLDA_test(topicmodels.LDA.sparseLDA_test) PrintStream(java.io.PrintStream) LRHTMM(topicmodels.markovmodel.LRHTMM) ACCTM(topicmodels.correspondenceModels.ACCTM) ACCTM_CHard(topicmodels.correspondenceModels.ACCTM_CHard) DCMCorrLDA_test(topicmodels.correspondenceModels.DCMCorrLDA_test) Date(java.util.Date) DCMCorrLDA_multi_E_test(topicmodels.correspondenceModels.DCMCorrLDA_multi_E_test) structures._Doc(structures._Doc) DCM.sparseDCMLDA_test(topicmodels.DCM.sparseDCMLDA_test) FileOutputStream(java.io.FileOutputStream) topicmodels.correspondenceModels.corrLDA_Gibbs(topicmodels.correspondenceModels.corrLDA_Gibbs) LDA_Gibbs(topicmodels.LDA.LDA_Gibbs) LRHTSM(topicmodels.markovmodel.LRHTSM) 
ParentChildAnalyzer(Analyzer.ParentChildAnalyzer) ACCTM_CZ(topicmodels.correspondenceModels.ACCTM_CZ) File(java.io.File) SimpleDateFormat(java.text.SimpleDateFormat) ACCTM_C(topicmodels.correspondenceModels.ACCTM_C)

Example 13 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

Class VectorReviewMain, method main.

/**
 * Entry point that loads pre-computed document vectors and runs one
 * classifier over them, selected by the hard-coded paradigm ("SUP" or "SEMI")
 * and classifier name below.
 *
 * <p>Flow: build a {@code VctAnalyzer}, load the vector file, fetch the corpus,
 * call {@code mapLabels(4)} on it, then construct the chosen learner and run
 * its cross validation (the "PR" option calls {@code train} instead). Unknown
 * names only print a notice.
 *
 * @param args command-line arguments; not used
 * @throws IOException declared on the original signature
 * @throws ParseException declared on the original signature
 */
public static void main(String[] args) throws IOException, ParseException {
    /* ---- settings to adjust before running ---- */
    int numClasses = 5;      // number of classes handed to the analyzer
    int minDocLength = 5;    // document length threshold
    int folds = 10;          // k of the k-fold cross validation
    // Supervised options handled below: "NB", "KNN", "LR", "PRLR", "SVM", "PR".
    // Semi-supervised options handled below: "GF", "GF-RW", "GF-RW-ML".
    String learnerName = "GF-RW-ML";
    // String modelPath = "./data/Model/";
    double regC = 1.0;
    String paradigm = "SEMI"; // "SUP" or "SEMI"
    String baseLearner = "SVM";

    /* ---- input files ---- */
    String featureFile = "data/Features/fv_2gram_BM25_CHI_small.txt";
    String vectorFile = "data/FVs/vct_2gram_BM25_CHI_tablet_small.dat";
    // String featureLocation = "data/Features/fv_fake.txt";
    // String vctfile = "data/Fvs/LinearRegression.dat";

    /* ---- debugging ---- */
    // String debugOutput = String.format("data/debug/%s.sim.pair", classifier);
    String debugPath = null;

    /* ---- load the data ---- */
    System.out.println("Loading vectors from file, wait...");
    VctAnalyzer loader = new VctAnalyzer(numClasses, minDocLength, featureFile);
    loader.LoadDoc(vectorFile);
    _Corpus data = loader.getCorpus();
    // Original note: "make it binary" - presumably collapses the labels into
    // two classes; confirm against _Corpus.mapLabels.
    data.mapLabels(4);

    /* ---- supervised learners ---- */
    if ("SUP".equals(paradigm)) {
        if ("NB".equals(learnerName)) {
            System.out.println("Start naive bayes, wait...");
            NaiveBayes nb = new NaiveBayes(data);
            nb.crossValidation(folds, data);
            return;
        }
        if ("KNN".equals(learnerName)) {
            System.out.println("Start kNN, wait...");
            KNN knn = new KNN(data, 10, 1);
            knn.crossValidation(folds, data);
            return;
        }
        if ("LR".equals(learnerName)) {
            System.out.println("Start logistic regression, wait...");
            LogisticRegression lr = new LogisticRegression(data, regC);
            lr.setDebugOutput(debugPath);
            lr.crossValidation(folds, data);
            // lr.saveModel(modelPath + "LR.model");
            return;
        }
        if ("PRLR".equals(learnerName)) {
            System.out.println("Start posterior regularized logistic regression, wait...");
            PRLogisticRegression prlr = new PRLogisticRegression(data, regC);
            prlr.setDebugOutput(debugPath);
            prlr.crossValidation(folds, data);
            // prlr.saveModel(modelPath + "LR.model");
            return;
        }
        if ("SVM".equals(learnerName)) {
            System.out.println("Start SVM, wait...");
            SVM svm = new SVM(data, regC);
            svm.crossValidation(folds, data);
            return;
        }
        if ("PR".equals(learnerName)) {
            System.out.println("Start PageRank, wait...");
            PageRank pageRank = new PageRank(data, regC, 100, 50, 1e-6);
            pageRank.train(data.getCollection());
            return;
        }
        System.out.println("Classifier has not been developed yet!");
        return;
    }

    /* ---- anything that is neither "SUP" nor "SEMI" ---- */
    if (!"SEMI".equals(paradigm)) {
        System.out.println("Learning paradigm has not been developed yet!");
        return;
    }

    /* ---- semi-supervised learners ---- */
    double learningRatio = 1.0;
    int k = 20;           // k nearest labeled
    int kPrime = 20;      // k' nearest unlabeled
    double tAlpha = 1.0;  // labeled data weight
    double tBeta = 0.1;   // unlabeled data weight
    double tDelta = 1e-4; // convergence of random walk
    double tEta = 0.5;    // weight of random walk
    boolean simFlag = false;
    double threshold = 0.5;
    // bound for generating rating constraints (must be zero in binary case)
    int bound = 0;
    boolean metricLearning = true;

    if ("GF".equals(learnerName)) {
        GaussianFields gf = new GaussianFields(data, baseLearner, regC);
        gf.crossValidation(folds, data);
        return;
    }
    if ("GF-RW".equals(learnerName)) {
        GaussianFields gfRandomWalk = new GaussianFieldsByRandomWalk(data, baseLearner, regC, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, false);
        gfRandomWalk.setDebugOutput(debugPath);
        gfRandomWalk.crossValidation(folds, data);
        return;
    }
    if ("GF-RW-ML".equals(learnerName)) {
        LinearSVMMetricLearning metricLearner = new LinearSVMMetricLearning(data, baseLearner, regC, learningRatio, k, kPrime, tAlpha, tBeta, tDelta, tEta, false, bound);
        metricLearner.setMetricLearningMethod(metricLearning);
        metricLearner.setDebugOutput(debugPath);
        metricLearner.crossValidation(folds, data);
        return;
    }
    System.out.println("Classifier has not been developed yet!");
}
Also used : PRLogisticRegression(Classifier.supervised.PRLogisticRegression) GaussianFieldsByRandomWalk(Classifier.semisupervised.GaussianFieldsByRandomWalk) PageRank(influence.PageRank) VctAnalyzer(Analyzer.VctAnalyzer) KNN(Classifier.supervised.KNN) SVM(Classifier.supervised.SVM) structures._Corpus(structures._Corpus) NaiveBayes(Classifier.supervised.NaiveBayes) LinearSVMMetricLearning(Classifier.metricLearning.LinearSVMMetricLearning) GaussianFields(Classifier.semisupervised.GaussianFields) PRLogisticRegression(Classifier.supervised.PRLogisticRegression) LogisticRegression(Classifier.supervised.LogisticRegression)

Example 14 with structures._Corpus

use of structures._Corpus in project IR_Base by Linda-sunshine.

Class featureGeneration, method main.

/**
 * Entry point: runs DF-based feature selection over the article directory and
 * then asks the analyzer for the resulting corpus.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>make sure {@code ./data/results} and a time-stamped result folder exist;</li>
 *   <li>build a {@code ParentChildAnalyzer}, load the stop words and the article directory;</li>
 *   <li>call {@code featureSelection(...)} with the feature-file path;</li>
 *   <li>call {@code returnCorpus(...)} with the feature-statistics path.</li>
 * </ol>
 *
 * <p>Locals that were copied from the topic-model drivers but never read here
 * (topic counts, Gibbs settings, sentiment word lists, product lists, ...) have
 * been removed; only values that reach a call below are kept.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared by the signature; presumably raised by the
 *         analyzer's loading routines -- TODO confirm against ParentChildAnalyzer
 * @throws ParseException declared by the signature; presumably raised while
 *         parsing the input documents -- TODO confirm against ParentChildAnalyzer
 */
public static void main(String[] args) throws IOException, ParseException {
    // Number of classes handed to the analyzer.
    int classNumber = 5;
    // N-gram order; the default value is unigram.
    int Ngram = 1;
    // Document length threshold handed to the analyzer.
    int lengthThreshold = 5;
    // Label embedded in the result-folder name and the statistics file name.
    String topicmodel = "featureGeneration";

    /**
     ***The parameters used in loading files.****
     */
    // Other values that appeared as commented-out alternatives: "Yahoo", "Gadgets", "APP".
    String articleType = "Tech";
    String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
    String suffix = ".json";
    // Token model.
    String tokenModel = "./data/Model/en-token.bin";
    String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s_sample.txt", Ngram, articleType);
    String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, articleType, topicmodel);

    // mkdirs() (rather than mkdir()) also creates missing parent directories,
    // and its result is checked so that a failed creation is reported.
    File rootFolder = new File("./data/results");
    if (!rootFolder.exists()) {
        System.out.println("creating root directory" + rootFolder);
        if (!rootFolder.mkdirs() && !rootFolder.isDirectory()) {
            System.err.println("Warning: could not create directory " + rootFolder);
        }
    }

    // Result folder name: month-day-hourminute-label.
    // NOTE(review): hour and minute are not zero-padded, so e.g. 1:23 and 12:3
    // produce the same name -- left unchanged to keep existing folder names stable.
    Calendar today = Calendar.getInstance();
    String filePrefix = String.format("./data/results/%s-%s-%s%s-%s", 1 + today.get(Calendar.MONTH), today.get(Calendar.DAY_OF_MONTH), today.get(Calendar.HOUR_OF_DAY), today.get(Calendar.MINUTE), topicmodel);
    File resultFolder = new File(filePrefix);
    if (!resultFolder.exists()) {
        System.out.println("creating directory" + resultFolder);
        if (!resultFolder.mkdirs() && !resultFolder.isDirectory()) {
            System.err.println("Warning: could not create directory " + resultFolder);
        }
    }

    /**
     ***Parameters in feature selection.****
     */
    String stopwords = "./data/Model/stopwords.dat";
    // Feature selection method.
    String featureSelection = "DF";
    // Used in feature selection, the starting point of the features.
    double startProb = 0.50;
    // Used in feature selection, the ending point of the features.
    double endProb = 1;
    // Filter the features with DFs smaller than this threshold (minDF).
    // NOTE(review): maxDF = -1 presumably disables the upper bound -- confirm in featureSelection.
    int maxDF = -1, minDF = 10;

    System.out.println("Performing feature selection, wait...");
    ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
    analyzer.LoadStopwords(stopwords);
    analyzer.LoadDirectory(articleFolder, suffix);
    // Select the features.
    analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, maxDF, minDF);

    System.out.println("Creating feature vectors, wait...");
    // Get the collection of all the documents. The reference is not used
    // further here; the call itself is kept because it receives fvStatFile.
    _Corpus c = analyzer.returnCorpus(fvStatFile);
}
Also used : structures._Corpus(structures._Corpus) Calendar(java.util.Calendar) ParentChildAnalyzer(Analyzer.ParentChildAnalyzer) File(java.io.File)

Aggregations

structures._Corpus (structures._Corpus)11 File (java.io.File)7 ParentChildAnalyzer (Analyzer.ParentChildAnalyzer)5 SVM (Classifier.supervised.SVM)5 structures._Doc (structures._Doc)5 LogisticRegression (Classifier.supervised.LogisticRegression)4 NaiveBayes (Classifier.supervised.NaiveBayes)4 DocAnalyzer (Analyzer.DocAnalyzer)3 LinearSVMMetricLearning (Classifier.metricLearning.LinearSVMMetricLearning)3 GaussianFields (Classifier.semisupervised.GaussianFields)3 GaussianFieldsByRandomWalk (Classifier.semisupervised.GaussianFieldsByRandomWalk)3 PageRank (influence.PageRank)3 FileOutputStream (java.io.FileOutputStream)3 ArrayList (java.util.ArrayList)3 Calendar (java.util.Calendar)3 LDA_Gibbs (topicmodels.LDA.LDA_Gibbs)3 LDA_Variational_multithread (topicmodels.multithreads.LDA.LDA_Variational_multithread)3 topicmodels.multithreads.pLSA.pLSA_multithread (topicmodels.multithreads.pLSA.pLSA_multithread)3 topicmodels.pLSA.pLSA (topicmodels.pLSA.pLSA)3 VctAnalyzer (Analyzer.VctAnalyzer)2