Search in sources :

Example 56 with StopWatch

use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.

the class ESIndex method getTerms.

/**
 * Collects the distinct terms stored in the term vector of the body field for one document.
 *
 * @param id id of the document whose term vector is requested
 * @return the terms of the body field as stored in the term vector (described as stemmed by
 *         the original documentation); an empty set when the response carries no terms for
 *         the body field
 * @throws IOException if the term vector fields of the response cannot be read
 */
public Set<String> getTerms(String id) throws IOException {
    // Timing is only collected when debug logging is on.
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id).setOffsets(false).setPositions(false).setFieldStatistics(false).setSelectedFields(this.bodyField).execute().actionGet();
    Set<String> termsSet = new HashSet<>();
    Terms terms = response.getFields().terms(this.bodyField);
    // Fields.terms(...) yields null when the field is absent; treat that as "no terms".
    if (terms != null) {
        TermsEnum iterator = terms.iterator(null);
        // Walk until next() signals exhaustion with null instead of counting up to
        // terms.size(), which is allowed to be -1 when the count is not available.
        while (iterator.next() != null) {
            termsSet.add(iterator.term().utf8ToString());
        }
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgrams from doc " + id + " = " + stopWatch + " It has " + termsSet.size() + " ngrams");
    }
    return termsSet;
}
Also used : Terms(org.apache.lucene.index.Terms) TermVectorResponse(org.elasticsearch.action.termvector.TermVectorResponse) StopWatch(org.apache.commons.lang3.time.StopWatch) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 57 with StopWatch

use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.

the class MLACPlattScalingTest method test2.

/**
 * Manual comparison of raw logistic-regression outputs against Platt-scaled probabilities.
 * Trains a multi-label logistic regression on the ohsumed training set, builds an
 * {@code MLACPlattScaling} on top of it, and prints class scores, class probabilities and
 * Platt-scaled class probabilities for rows 0..9.
 * NOTE(review): assumes the data set has at least 10 rows - confirm.
 *
 * @throws Exception if loading the data set or training fails
 */
private static void test2() throws Exception {
    MultiLabelClfDataSet dataSet = TRECFormat.loadMultiLabelClfDataSet(new File(DATASETS, "ohsumed/3/train.trec"), DataSetType.ML_CLF_SPARSE, true);
    List<MultiLabel> assignments = DataSetUtil.gatherMultiLabels(dataSet);
    MLLogisticTrainer trainer = MLLogisticTrainer.getBuilder().setGaussianPriorVariance(10000).build();
    MLLogisticRegression logisticRegression = trainer.train(dataSet, assignments);
    // The stop watch that used to be started here was never read or stopped, so it is gone.
    MLACPlattScaling plattScaling = new MLACPlattScaling(dataSet, logisticRegression);
    for (int i = 0; i < 10; i++) {
        System.out.println(Arrays.toString(logisticRegression.predictClassScores(dataSet.getRow(i))));
        System.out.println(Arrays.toString(logisticRegression.predictClassProbs(dataSet.getRow(i))));
        System.out.println(Arrays.toString(plattScaling.predictClassProbs(dataSet.getRow(i))));
        System.out.println("======================");
    }
}
Also used : MLLogisticTrainer(edu.neu.ccs.pyramid.multilabel_classification.multi_label_logistic_regression.MLLogisticTrainer) File(java.io.File) MLLogisticRegression(edu.neu.ccs.pyramid.multilabel_classification.multi_label_logistic_regression.MLLogisticRegression) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 58 with StopWatch

use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.

the class RidgeLogisticOptimizerTest method test3.

/**
 * Manual run of 200 L-BFGS iterations on a ridge-regularised logistic loss for the
 * 20newsgroup data, printing loss and train/test accuracy before the first iteration and
 * after every iteration.
 *
 * @throws Exception if loading the data sets fails
 */
private static void test3() throws Exception {
    // Other data sets previously tried here (same train.trec / test.trec layout, CLF_SPARSE):
    // "/imdb/3/" and "/spam/trec_data/".
    ClfDataSet dataSet = TRECFormat.loadClfDataSet(new File(DATASETS, "20newsgroup/1/train.trec"), DataSetType.CLF_SPARSE, true);
    ClfDataSet testSet = TRECFormat.loadClfDataSet(new File(DATASETS, "20newsgroup/1/test.trec"), DataSetType.CLF_SPARSE, true);
    double variance = 1000;
    LogisticRegression logisticRegression = new LogisticRegression(dataSet.getNumClasses(), dataSet.getNumFeatures());
    Optimizable.ByGradientValue loss = new LogisticLoss(logisticRegression, dataSet, variance, true);
    // GradientDescent over the same loss was the earlier alternative to LBFGS.
    LBFGS optimizer = new LBFGS(loss);
    System.out.println("after initialization");
    System.out.println("train acc = " + Accuracy.accuracy(logisticRegression, dataSet));
    System.out.println("test acc = " + Accuracy.accuracy(logisticRegression, testSet));
    // The stop watch that used to be started here was never read or stopped, so it is gone.
    for (int i = 0; i < 200; i++) {
        optimizer.iterate();
        System.out.println("after iteration " + i);
        System.out.println("loss = " + loss.getValue());
        System.out.println("train acc = " + Accuracy.accuracy(logisticRegression, dataSet));
        System.out.println("test acc = " + Accuracy.accuracy(logisticRegression, testSet));
    }
}
Also used : ClfDataSet(edu.neu.ccs.pyramid.dataset.ClfDataSet) File(java.io.File) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 59 with StopWatch

use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.

the class RidgeLogisticTrainerTest method test3.

/**
 * Trains a ridge logistic regression on the 20newsgroup training split, prints the stop
 * watch started just before training, and then prints train and test accuracy.
 *
 * @throws Exception if loading the data sets or training fails
 */
private static void test3() throws Exception {
    File trainFile = new File(DATASETS, "20newsgroup/1/train.trec");
    File testFile = new File(DATASETS, "20newsgroup/1/test.trec");
    ClfDataSet trainData = TRECFormat.loadClfDataSet(trainFile, DataSetType.CLF_SPARSE, true);
    ClfDataSet testData = TRECFormat.loadClfDataSet(testFile, DataSetType.CLF_SPARSE, true);
    System.out.println(trainData.getMetaInfo());

    RidgeLogisticTrainer ridgeTrainer = RidgeLogisticTrainer.getBuilder()
            .setEpsilon(0.01)
            .setGaussianPriorVariance(0.5)
            .setHistory(5)
            .build();

    // Only the call to train(...) is timed; the watch is printed while still running.
    StopWatch timer = new StopWatch();
    timer.start();
    LogisticRegression model = ridgeTrainer.train(trainData);
    System.out.println(timer);

    System.out.println("train: " + Accuracy.accuracy(model, trainData));
    System.out.println("test: " + Accuracy.accuracy(model, testData));
}
Also used : ClfDataSet(edu.neu.ccs.pyramid.dataset.ClfDataSet) File(java.io.File) StopWatch(org.apache.commons.lang3.time.StopWatch)

Example 60 with StopWatch

use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.

the class LKTreeBoostTest method spam_missing_build.

/**
 * Runs 200 boosting rounds of an {@code LKBoost} with 2 classes on the spam data set with
 * missing values, prints the elapsed time and training accuracy, prints per-class label
 * counts next to the summed predicted probabilities, and finally serializes the ensemble.
 *
 * @throws Exception if loading, training or serialization fails
 */
static void spam_missing_build() throws Exception {
    File trainFile = new File(DATASETS, "/spam/missing_value/0.5_missing/train.trec");
    ClfDataSet dataSet = TRECFormat.loadClfDataSet(trainFile, DataSetType.CLF_DENSE, true);
    System.out.println(dataSet.getMetaInfo());

    LKBoost booster = new LKBoost(2);
    LKBoostOptimizer optimizer = new LKBoostOptimizer(booster, dataSet);
    optimizer.initialize();

    // Time the boosting rounds only.
    StopWatch timer = new StopWatch();
    timer.start();
    int round = 0;
    while (round < 200) {
        System.out.println("round=" + round);
        optimizer.iterate();
        round++;
    }
    timer.stop();
    System.out.println(timer);

    double trainAccuracy = Accuracy.accuracy(booster, dataSet);
    System.out.println("accuracy=" + trainAccuracy);

    // For each class: how many points carry that label vs. the total probability mass
    // the model assigns to it over all points.
    int[] trueLabels = dataSet.getLabels();
    List<double[]> probsPerPoint = booster.predictClassProbs(dataSet);
    for (int classIndex = 0; classIndex < dataSet.getNumClasses(); classIndex++) {
        int labelCount = 0;
        double probMass = 0;
        for (int point = 0; point < dataSet.getNumDataPoints(); point++) {
            labelCount += (trueLabels[point] == classIndex) ? 1 : 0;
            probMass += probsPerPoint.get(point)[classIndex];
        }
        System.out.println("for class " + classIndex);
        System.out.println("number of matches =" + labelCount);
        System.out.println("sum of probs = " + probMass);
    }

    File ensembleFile = new File(TMP, "/LKTreeBoostTest/ensemble.ser");
    booster.serialize(ensembleFile);
}
Also used : File(java.io.File) StopWatch(org.apache.commons.lang3.time.StopWatch)

Aggregations

StopWatch (org.apache.commons.lang3.time.StopWatch) 78, File (java.io.File) 48, ArrayList (java.util.ArrayList) 17, ClfDataSet (edu.neu.ccs.pyramid.dataset.ClfDataSet) 8, Vector (org.apache.mahout.math.Vector) 8, VirtualMachine (com.microsoft.azure.management.compute.VirtualMachine) 7, Creatable (com.microsoft.azure.management.resources.fluentcore.model.Creatable) 7, Config (edu.neu.ccs.pyramid.configuration.Config) 7, Network (com.microsoft.azure.management.network.Network) 6, IOException (java.io.IOException) 6, ResourceGroup (com.microsoft.azure.management.resources.ResourceGroup) 5, PriorProbClassifier (edu.neu.ccs.pyramid.classification.PriorProbClassifier) 5, ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper) 4, Region (com.microsoft.azure.management.resources.fluentcore.arm.Region) 4, StorageAccount (com.microsoft.azure.management.storage.StorageAccount) 4, List (java.util.List) 4, PublicIPAddress (com.microsoft.azure.management.network.PublicIPAddress) 3, LogisticRegression (edu.neu.ccs.pyramid.classification.logistic_regression.LogisticRegression) 3, HashMap (java.util.HashMap) 3, IntStream (java.util.stream.IntStream) 3