Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class BurstinessMain, method main:
public static void main(String[] args) throws IOException, ParseException {
int mb = 1024 * 1024;
Runtime rTime = Runtime.getRuntime();
System.out.println("totalMem\t:" + rTime.totalMemory() / mb);
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value, which can also be "TFIDF" or "BM25".
String featureValue = "TF";
// The way of normalization (only 1 and 2).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// ACCTM, ACCTM_TwoTheta, ACCTM_C, ACCTM_CZ, ACCTM_CZLR, LDAonArticles, ACCTM_C,
// correspondence_LDA_Gibbs, LDA_Gibbs_Debug, LDA_Variational_multithread
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM,
String topicmodel = "Burstiness";
String category = "tablet";
int number_of_topics = 30;
// false means no reviews from NewEgg are used in training
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = true;
// 0: load nothing as prior; 1: load both sentiment and aspect; 2: load aspect only
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// A negative converge value means the likelihood convergence check is skipped.
double converge = 1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 1;
int gibbs_iteration = 500, gibbs_lag = 50;
int displayLap = 50;
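// Override the Gibbs sampling settings with small values, presumably for a quick debugging run.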
gibbs_iteration = 4;
gibbs_lag = 2;
displayLap = 2;
double burnIn = 0.4;
boolean sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
// articleType = "Gadgets";
// articleType = "Yahoo";
// articleType = "APP";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_%s_%s.txt", Ngram, articleType, topicmodel);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
SimpleDateFormat dateFormatter = new SimpleDateFormat("yyyyMMdd-HHmm");
String filePrefix = String.format("./data/results/%s", dateFormatter.format(new Date()));
filePrefix = filePrefix + "-" + topicmodel + "-" + articleType;
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String outputFile = filePrefix + "/consoleOutput.txt";
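// Redirect all subsequent console output into consoleOutput.txt under the result folder.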
PrintStream printStream = new PrintStream(new FileOutputStream(outputFile));
System.setOut(printStream);
String infoFilePath = filePrefix + "/Information.txt";
// Store the top-k word distribution over topics.
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.;
// Used in feature selection, the ending point of the features.
double endProb = 0.999;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 5;
System.out.println("Performing feature selection, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
/**
*** parent child topic model ****
*/
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadDirectory(commentFolder, suffix);
if (topicmodel.equals("LDA_APPMerged"))
articleFolder = String.format("./data/ParentChildTopicModel/%sDescriptionsReviews", articleType);
// articleFolder = String.format(
// "./data/ParentChildTopicModel/%sArticles4Merged",
// articleType);
//
// commentFolder = String.format(
// "./data/ParentChildTopicModel/%sComments4Merged",
// articleType);
//
analyzer.LoadParentDirectory(articleFolder, suffix);
// analyzer.LoadDirectory(articleFolder, suffix);
// analyzer.LoadDirectory(commentFolder, suffix);
analyzer.LoadChildDirectory(commentFolder, suffix);
// if (topicmodel.equals("LDA_APP") && !topicmodel.equals("LDA_APPMerged"))
// analyzer.LoadChildDirectory(commentFolder, suffix);
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// analyzer.LoadNewEggDirectory(newEggFolder, suffix); //Load all the documents as the data set.
// analyzer.LoadDirectory(amazonFolder, suffix);
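// Convert raw term counts into the chosen feature representation ("TF" here) with the selected normalization.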
analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
// _Corpus c = analyzer.getCorpus();
// analyzer.generateFakeCorpus(filePrefix);
// analyzer.analyzeBurstiness(filePrefix);
}
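For orientation, the _Corpus returned by analyzer.returnCorpus(fvStatFile) is typically inspected through the accessors used elsewhere on this page (getCollection, getFeatureSize, getFeature, getTotalDocLength). The following is a minimal sketch under that assumption; it is not part of BurstinessMain itself.

// Sketch: summarize the corpus returned by analyzer.returnCorpus(fvStatFile).
// Only accessors that appear in the snippets on this page are used.
public static void inspectCorpus(_Corpus c) {
    System.out.println("vocabulary size\t" + c.getFeatureSize());
    // Print the first few feature (word) names.
    for (int i = 0; i < Math.min(10, c.getFeatureSize()); i++)
        System.out.println(i + "\t" + c.getFeature(i));
    // Average document length over the whole collection.
    ArrayList<_Doc> docs = c.getCollection();
    double totalLen = 0;
    for (_Doc d : docs)
        totalLen += d.getTotalDocLength();
    System.out.println("average doc len\t" + totalLen / docs.size());
}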
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class languageModelBaseLine, method main:
public static void main(String[] args) throws IOException, ParseException {
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value, which can also be "TFIDF" or "BM25".
String featureValue = "BM25";
// The way of normalization (only 1 and 2).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM, ParentChild_Gibbs
String topicmodel = "languageModel";
String category = "tablet";
int number_of_topics = 20;
// false means no reviews from NewEgg are used in training
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = false;
// 0: load nothing as prior; 1: load both sentiment and aspect; 2: load aspect only
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// A negative converge value means the likelihood convergence check is skipped.
double converge = 1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 1;
int gibbs_iteration = 2000, gibbs_lag = 50;
gibbs_iteration = 4;
gibbs_lag = 2;
double burnIn = 0.4;
boolean display = true, sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
// articleType = "GadgetsArticles";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
// String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_topicmodel.txt", Ngram);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
Calendar today = Calendar.getInstance();
// Calendar.MONTH is zero-based, so add 1 (as done in the outputFile snippet below).
String filePrefix = String.format("./data/results/%s-%s-%s%s-%s", 1 + today.get(Calendar.MONTH), today.get(Calendar.DAY_OF_MONTH), today.get(Calendar.HOUR_OF_DAY), today.get(Calendar.MINUTE), topicmodel);
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String infoFilePath = filePrefix + "/Information.txt";
// Store the top-k word distribution over topics.
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.5;
// Used in feature selection, the ending point of the features.
double endProb = 0.999;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 30;
// System.out.println("Performing feature selection, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// analyzer.LoadStopwords(stopwords);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
/**
*** parent child topic model ****
*/
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
analyzer.LoadParentDirectory(articleFolder, suffix);
analyzer.LoadChildDirectory(commentFolder, suffix);
// analyzer.LoadNewEggDirectory(newEggFolder, suffix); //Load all the documents as the data set.
// analyzer.LoadDirectory(amazonFolder, suffix);
// analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
double mu = 800;
languageModelBaseLine lm = new languageModelBaseLine(c, mu);
lm.generateReferenceModel();
lm.printTopChild4Stn(filePrefix);
// bm25Corr.rankChild4Stn(c, TopChild4StnFile);
// bm25Corr.rankStn4Child(c, TopStn4ChildFile);
// bm25Corr.rankChild4Parent(c, TopChild4ParentFile);
// bm25Corr.discoverSpecificComments(c, similarityFile);
// String DFFile = filePrefix+"/df.txt";
// bm25Corr.outputDF(c, DFFile);
}
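The mu value passed to languageModelBaseLine is presumably a Dirichlet smoothing weight for the collection-level reference model built by generateReferenceModel. Below is a minimal sketch of Dirichlet smoothing under that assumption; the method is illustrative only and not an IR_Base API.

// Dirichlet-smoothed term probability: p(w|d) = (c(w,d) + mu * p(w|C)) / (|d| + mu)
public static double dirichletSmoothedProb(double termCountInDoc, double docLength,
        double collectionProb, double mu) {
    return (termCountInDoc + mu * collectionProb) / (docLength + mu);
}

With mu = 800, short documents are pulled strongly toward the collection model, while long documents rely mostly on their own counts.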
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class outputFile, method outputFiles:
public static void outputFiles(String filePrefix, _Corpus c) {
try {
String selectedSentencesinParentFile = filePrefix + "/selected_Stn.txt";
String selectedCommentsFile = filePrefix + "/selected_Comments.txt";
String sctmFormatParentFile = filePrefix + "/abagf.AT.txt";
String sctmFormatChildFile = filePrefix + "/cbagf.AT.txt";
String sctmWordFile = filePrefix + "/words.AT.txt";
String stnLengthFile = filePrefix + "/selected_StnLength.txt";
String shortStnFile = filePrefix + "/selected_ShortStn.txt";
String longStnFile = filePrefix + "/selected_LongStn.txt";
if (c.getFeatureSize() != 0) {
PrintWriter wordPW = new PrintWriter(new File(sctmWordFile));
for (int i = 0; i < c.getFeatureSize(); i++) {
String wordName = c.getFeature(i);
wordPW.println(wordName);
}
wordPW.flush();
wordPW.close();
}
PrintWriter stnLengthPW = new PrintWriter(new File(stnLengthFile));
PrintWriter shortParentPW = new PrintWriter(new File(shortStnFile));
PrintWriter longParentPW = new PrintWriter(new File(longStnFile));
PrintWriter parentPW = new PrintWriter(new File(selectedSentencesinParentFile));
PrintWriter childPW = new PrintWriter(new File(selectedCommentsFile));
PrintWriter sctmParentPW = new PrintWriter(new File(sctmFormatParentFile));
PrintWriter sctmChildPW = new PrintWriter(new File(sctmFormatChildFile));
int totoalParentNum = 0;
TreeMap<Integer, _ParentDoc> parentMap = new TreeMap<Integer, _ParentDoc>();
int totalStnNum = 0;
ArrayList<_Doc> m_trainSet = c.getCollection();
ArrayList<Integer> parentNameList = new ArrayList<Integer>();
for (_Doc d : m_trainSet) {
if (d instanceof _ParentDoc) {
// HashMap<Integer, _Stn> stnMap = ((_ParentDoc) d).m_sentenceMap;
totoalParentNum += 1;
String parentName = d.getName();
parentMap.put(Integer.parseInt(parentName), (_ParentDoc) d);
parentNameList.add(Integer.parseInt(parentName));
}
}
ArrayList<Double> parentDocLenList = new ArrayList<Double>();
ArrayList<Double> childDocLenList = new ArrayList<Double>();
double parentDocLenSum = 0;
double childDocLenSum = 0;
for (int parentID : parentMap.keySet()) {
_ParentDoc parentObj = parentMap.get(parentID);
double parentDocLen = parentObj.getTotalDocLength();
parentDocLenSum += parentDocLen;
parentDocLenList.add(parentDocLen);
for (_ChildDoc cDoc : parentObj.m_childDocs) {
double childDocLen = cDoc.getTotalDocLength();
childDocLenList.add(childDocLen);
childDocLenSum += childDocLen;
}
_Stn[] sentenceArray = parentObj.getSentences();
int selectedStn = 0;
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
selectedStn += 1;
stnLengthPW.println(stnObj.getLength());
// if(stnObj==null)
// continue;
// selectedStn += 1;
}
totalStnNum += selectedStn;
parentPW.print(parentID + "\t" + selectedStn + "\t");
shortParentPW.print(parentID + "\t");
longParentPW.print(parentID + "\t");
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
if (stnObj.getLength() < 15)
shortParentPW.print((stnObj.getIndex() + 1) + "\t");
else
longParentPW.print((stnObj.getIndex() + 1) + "\t");
parentPW.print((stnObj.getIndex() + 1) + "\t");
}
parentPW.println();
longParentPW.println();
shortParentPW.println();
}
System.out.println("longest child\t" + Collections.max(childDocLenList));
System.out.println("shortest child\t" + Collections.min(childDocLenList));
System.out.println("parent doc len\t" + parentDocLenSum / parentDocLenList.size());
System.out.println("child doc len\t" + childDocLenSum / childDocLenList.size());
parentPW.flush();
parentPW.close();
stnLengthPW.flush();
stnLengthPW.close();
shortParentPW.flush();
shortParentPW.close();
longParentPW.flush();
longParentPW.close();
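// SCTM format: the first line of each file is the number of parent documents; one block per parent follows.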
sctmParentPW.println(totoalParentNum);
sctmChildPW.println(totoalParentNum);
System.out.println("stnNum" + totalStnNum);
for (int parentID : parentMap.keySet()) {
_ParentDoc d = parentMap.get(parentID);
// HashMap<Integer, _Stn> stnMap = ((_ParentDoc) d).m_sentenceMap;
_Stn[] sentenceArray = (d).getSentences();
int selectedStn = 0;
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
selectedStn += 1;
}
sctmParentPW.println(selectedStn);
for (int i = 0; i < sentenceArray.length; i++) {
_Stn stnObj = sentenceArray[i];
if (stnObj == null)
continue;
_SparseFeature[] sv = stnObj.getFv();
sctmParentPW.print((int) stnObj.getLength() + "\t");
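// Expand the sparse feature vector into a flat bag of words: each feature index is written once per occurrence.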
for (int j = 0; j < sv.length; j++) {
int index = sv[j].getIndex();
double value = sv[j].getValue();
for (int v = 0; v < value; v++) sctmParentPW.print(index + "\t");
}
sctmParentPW.println();
}
ArrayList<_ChildDoc> childDocs = ((_ParentDoc) d).m_childDocs;
sctmChildPW.println(childDocs.size());
String parentName = d.getName();
TreeMap<Integer, _ChildDoc> childMap = new TreeMap<Integer, _ChildDoc>();
for (_ChildDoc cDoc : childDocs) {
String childName = cDoc.getName();
int childID = Integer.parseInt(childName.replace(parentName + "_", ""));
childMap.put(childID, cDoc);
}
childPW.print(parentName + "\t");
for (int t : childMap.keySet()) {
_ChildDoc cDoc = childMap.get(t);
sctmChildPW.print((int) cDoc.getTotalDocLength() + "\t");
childPW.print(cDoc.getName() + "\t");
// System.out.println(cDoc.getName() + "\t");
_SparseFeature[] fv = cDoc.getSparse();
for (int j = 0; j < fv.length; j++) {
int index = fv[j].getIndex();
double value = fv[j].getValue();
for (int v = 0; v < value; v++) {
sctmChildPW.print(index + "\t");
}
}
sctmChildPW.println();
}
childPW.println();
}
sctmParentPW.flush();
sctmParentPW.close();
sctmChildPW.flush();
sctmChildPW.close();
childPW.flush();
childPW.close();
} catch (Exception e) {
e.printStackTrace();
}
}
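Based on the writing logic above, abagf.AT.txt begins with the number of parent documents, then for each parent one line with its sentence count followed by one line per sentence (the sentence length, then the expanded word indices). The reader below is a minimal sketch of that layout; SctmParentReader is a hypothetical helper, not part of IR_Base.

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Scanner;

public class SctmParentReader {
    public static void main(String[] args) throws FileNotFoundException {
        Scanner in = new Scanner(new File("abagf.AT.txt"));
        int parentNum = Integer.parseInt(in.nextLine().trim()); // number of parent documents
        for (int p = 0; p < parentNum; p++) {
            int stnNum = Integer.parseInt(in.nextLine().trim()); // sentences kept for this parent
            for (int s = 0; s < stnNum; s++) {
                String[] tokens = in.nextLine().trim().split("\t");
                int stnLength = Integer.parseInt(tokens[0]); // first field: sentence length
                // tokens[1..] are word indices, repeated once per occurrence
                System.out.println("parent " + p + ", sentence " + s + ": " + stnLength + " tokens");
            }
        }
        in.close();
    }
}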
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class outputFile, method main:
public static void main(String[] args) throws IOException, ParseException {
// Define the number of classes in this Naive Bayes.
int classNumber = 5;
// The default value is unigram.
int Ngram = 1;
// The way of calculating the feature value, which can also be "TFIDF" or "BM25".
String featureValue = "TF";
// The way of normalization (only 1 and 2).
int norm = 0;
// Document length threshold
int lengthThreshold = 5;
// each document should have at least 2 sentences
int minimunNumberofSentence = 2;
/**
***parameters for the two-topic topic model****
*/
// 2topic, pLSA, HTMM, LRHTMM, Tensor, LDA_Gibbs, LDA_Variational, HTSM, LRHTSM, ParentChild_Gibbs
String topicmodel = "wsdm";
String category = "tablet";
int number_of_topics = 20;
// false means no reviews from NewEgg are used in training
boolean loadNewEggInTrain = true;
// false means no shuffling and true means shuffling
boolean setRandomFold = true;
// 0: load nothing as prior; 1: load both sentiment and aspect; 2: load aspect only
int loadAspectSentiPrior = 0;
// these two parameters must be larger than 1!!!
double alpha = 1.0 + 1e-2, beta = 1.0 + 1e-3, eta = topicmodel.equals("LDA_Gibbs") ? 200 : 5.0;
// A negative converge value means the likelihood convergence check is skipped.
double converge = -1e-9, lambda = 0.9;
int varIter = 10;
double varConverge = 1e-5;
int topK = 20, number_of_iteration = 50, crossV = 10;
int gibbs_iteration = 2000, gibbs_lag = 50;
gibbs_iteration = 10;
gibbs_lag = 2;
double burnIn = 0.4;
boolean display = true, sentence = false;
// most popular items under each category from Amazon
// needed for docSummary
String[] tabletProductList = { "B008GFRDL0" };
String[] cameraProductList = { "B005IHAIMA" };
String[] phoneProductList = { "B00COYOAYW" };
String[] tvProductList = { "B0074FGLUM" };
/**
***The parameters used in loading files.****
*/
String amazonFolder = "./data/amazon/tablet/topicmodel";
String newEggFolder = "./data/NewEgg";
String articleType = "Tech";
articleType = "Yahoo";
// articleType = "Gadgets";
// articleType = "APP";
String articleFolder = String.format("./data/ParentChildTopicModel/%sArticles", articleType);
String commentFolder = String.format("./data/ParentChildTopicModel/%sComments", articleType);
articleFolder = String.format("../../Code/Data/TextMiningProject/APPDescriptions");
commentFolder = String.format("../../Code/Data/TextMiningProject/APPReviews");
String suffix = ".json";
// Token model.
String tokenModel = "./data/Model/en-token.bin";
String stnModel = null;
String posModel = null;
if (topicmodel.equals("HTMM") || topicmodel.equals("LRHTMM") || topicmodel.equals("HTSM") || topicmodel.equals("LRHTSM")) {
// Sentence model.
stnModel = "./data/Model/en-sent.bin";
// POS model.
posModel = "./data/Model/en-pos-maxent.bin";
sentence = true;
}
String fvFile = String.format("./data/Features/fv_%dgram_topicmodel_%s.txt", Ngram, articleType);
// String fvFile = String.format("./data/Features/fv_%dgram_topicmodel.txt", Ngram);
String fvStatFile = String.format("./data/Features/fv_%dgram_stat_topicmodel.txt", Ngram);
String aspectList = "./data/Model/aspect_" + category + ".txt";
String aspectSentiList = "./data/Model/aspect_sentiment_" + category + ".txt";
String pathToPosWords = "./data/Model/SentiWordsPos.txt";
String pathToNegWords = "./data/Model/SentiWordsNeg.txt";
String pathToNegationWords = "./data/Model/negation_words.txt";
String pathToSentiWordNet = "./data/Model/SentiWordNet_3.0.0_20130122.txt";
File rootFolder = new File("./data/results");
if (!rootFolder.exists()) {
System.out.println("creating root directory" + rootFolder);
rootFolder.mkdir();
}
Calendar today = Calendar.getInstance();
String filePrefix = String.format("./data/results/%s-%s-%s%s-%s-%s", 1 + today.get(Calendar.MONTH), today.get(Calendar.DAY_OF_MONTH), today.get(Calendar.HOUR_OF_DAY), today.get(Calendar.MINUTE), topicmodel, articleType);
File resultFolder = new File(filePrefix);
if (!resultFolder.exists()) {
System.out.println("creating directory" + resultFolder);
resultFolder.mkdir();
}
String infoFilePath = filePrefix + "/Information.txt";
// Store the top-k word distribution over topics.
String topWordPath = filePrefix + "/topWords.txt";
/**
***Parameters in feature selection.****
*/
String stopwords = "./data/Model/stopwords.dat";
// Feature selection method.
String featureSelection = "DF";
// Used in feature selection, the starting point of the features.
double startProb = 0.0;
// Used in feature selection, the ending point of the features.
double endProb = 0.95;
// Filter the features with DFs smaller than this threshold.
int DFthreshold = 3;
System.out.println("Performing feature selection, wait...");
ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadStopwords(stopwords);
// analyzer.LoadParentDirectory(articleFolder, suffix);
// analyzer.LoadChildDirectory(commentFolder, suffix);
analyzer.LoadDirectory(commentFolder, suffix);
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, null, Ngram, lengthThreshold);
// analyzer.LoadDirectory(folder, suffix); //Load all the documents as the data set.
// analyzer.featureSelection(fvFile, featureSelection, startProb, endProb, DFthreshold); //Select the features.
System.out.println("Creating feature vectors, wait...");
// jsonAnalyzer analyzer = new jsonAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel);
// newEggAnalyzer analyzer = new newEggAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold, stnModel, posModel, category, 2);
/**
*** parent child topic model ****
*/
// ParentChildAnalyzer analyzer = new ParentChildAnalyzer(tokenModel, classNumber, fvFile, Ngram, lengthThreshold);
// analyzer.LoadParentDirectory(TechArticlesFolder, suffix);
// analyzer.LoadChildDirectory(TechCommentsFolder, suffix);
// analyzer.LoadDirectory(TechArticlesFolder, suffix);
// analyzer.LoadDirectory(TechCommentsFolder, suffix);
// analyzer.setFeatureValues(featureValue, norm);
// Get the collection of all the documents.
_Corpus c = analyzer.returnCorpus(fvStatFile);
statisticDocLen(c);
// outputFiles(filePrefix, c);
}
Use of structures._Corpus in project IR_Base by Linda-sunshine.
The class outputFile, method statisticDocLen:
public static void statisticDocLen(_Corpus c) {
ArrayList<Double> childDocLenList = new ArrayList<Double>();
double childDocLenSum = 0;
ArrayList<_Doc> m_trainSet = c.getCollection();
for (_Doc d : m_trainSet) {
double childDocLen = d.getTotalDocLength();
childDocLenList.add(childDocLen);
childDocLenSum += childDocLen;
}
System.out.println("longest child\t" + Collections.max(childDocLenList));
System.out.println("shortest child\t" + Collections.min(childDocLenList));
// System.out.println("parent doc len\t"+parentDocLenSum/parentDocLenList.size());
System.out.println("child doc len\t" + childDocLenSum / childDocLenList.size());
}