use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.
the class ESIndex method getTerms.
/**
 * Collects the distinct terms found in the term vector of this index's body field
 * for a single document.
 *
 * <p>The term vector is requested without offsets, positions or field statistics.
 * If the response holds no term vector for the body field, an empty set is returned
 * instead of failing with a {@link NullPointerException}.
 *
 * @param id id of the document whose term vector is requested
 * @return terms stemmed; empty when the body field has no term vector for the document
 * @throws IOException if the term vector cannot be read
 */
public Set<String> getTerms(String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false)
            .setPositions(false)
            .setFieldStatistics(false)
            .setSelectedFields(this.bodyField)
            .execute()
            .actionGet();
    Set<String> termsSet = new HashSet<>();
    // Fields.terms(...) yields null when the field is absent, so guard before iterating.
    Terms terms = response.getFields().terms(this.bodyField);
    if (terms != null) {
        TermsEnum iterator = terms.iterator(null);
        // Walk until next() reports exhaustion with null; Terms.size() is allowed to be -1,
        // so it is not a reliable loop bound.
        org.apache.lucene.util.BytesRef term;
        while ((term = iterator.next()) != null) {
            termsSet.add(term.utf8ToString());
        }
    }
    // stopWatch is only set if debug logging was enabled on entry; the log level may have
    // been switched on in between, so check the reference as well.
    if (stopWatch != null && logger.isDebugEnabled()) {
        logger.debug("time spent on getNgrams from doc " + id + " = " + stopWatch + " It has " + termsSet.size() + " ngrams");
    }
    return termsSet;
}
use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.
the class MLACPlattScalingTest method test2.
/**
 * Manual check that prints, for the first ten rows of the ohsumed training set, the raw
 * class scores and class probabilities of a trained multi-label logistic regression next
 * to the probabilities produced by {@code MLACPlattScaling} built on top of it.
 *
 * @throws Exception if loading the data set or training fails
 */
private static void test2() throws Exception {
    MultiLabelClfDataSet dataSet = TRECFormat.loadMultiLabelClfDataSet(
            new File(DATASETS, "ohsumed/3/train.trec"), DataSetType.ML_CLF_SPARSE, true);
    List<MultiLabel> assignments = DataSetUtil.gatherMultiLabels(dataSet);
    MLLogisticTrainer trainer = MLLogisticTrainer.getBuilder().setGaussianPriorVariance(10000).build();
    MLLogisticRegression logisticRegression = trainer.train(dataSet, assignments);
    // The StopWatch that used to be started here was never stopped or read, so it is gone.
    MLACPlattScaling plattScaling = new MLACPlattScaling(dataSet, logisticRegression);
    // Only the first ten rows are printed; the data set is expected to hold at least that many.
    for (int i = 0; i < 10; i++) {
        System.out.println(Arrays.toString(logisticRegression.predictClassScores(dataSet.getRow(i))));
        System.out.println(Arrays.toString(logisticRegression.predictClassProbs(dataSet.getRow(i))));
        System.out.println(Arrays.toString(plattScaling.predictClassProbs(dataSet.getRow(i))));
        System.out.println("======================");
    }
}
use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.
the class RidgeLogisticOptimizerTest method test3.
/**
 * Manual check that runs 200 L-BFGS iterations on a ridge-regularised logistic loss over
 * the 20newsgroup split and prints the loss plus train/test accuracy after every iteration.
 *
 * <p>Alternative inputs that were tried here before: the "/imdb/3" and "/spam/trec_data"
 * train/test splits (same loader arguments), and {@code GradientDescent} as the optimizer.
 *
 * @throws Exception if loading a data set fails
 */
private static void test3() throws Exception {
    ClfDataSet dataSet = TRECFormat.loadClfDataSet(
            new File(DATASETS, "20newsgroup/1/train.trec"), DataSetType.CLF_SPARSE, true);
    ClfDataSet testSet = TRECFormat.loadClfDataSet(
            new File(DATASETS, "20newsgroup/1/test.trec"), DataSetType.CLF_SPARSE, true);
    double variance = 1000;
    LogisticRegression logisticRegression = new LogisticRegression(dataSet.getNumClasses(), dataSet.getNumFeatures());
    Optimizable.ByGradientValue loss = new LogisticLoss(logisticRegression, dataSet, variance, true);
    LBFGS optimizer = new LBFGS(loss);
    // Baseline accuracy of the freshly constructed model, before any optimisation step.
    System.out.println("after initialization");
    System.out.println("train acc = " + Accuracy.accuracy(logisticRegression, dataSet));
    System.out.println("test acc = " + Accuracy.accuracy(logisticRegression, testSet));
    // The StopWatch that used to be started here was never stopped or read, so it is gone.
    for (int i = 0; i < 200; i++) {
        optimizer.iterate();
        System.out.println("after iteration " + i);
        System.out.println("loss = " + loss.getValue());
        System.out.println("train acc = " + Accuracy.accuracy(logisticRegression, dataSet));
        System.out.println("test acc = " + Accuracy.accuracy(logisticRegression, testSet));
    }
}
use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.
the class RidgeLogisticTrainerTest method test3.
/**
 * Manual check that trains a ridge logistic regression on the 20newsgroup split, prints
 * the data set meta information, the stopwatch reading taken right after training, and
 * the resulting train and test accuracy.
 *
 * @throws Exception if loading a data set or training fails
 */
private static void test3() throws Exception {
    File trainFile = new File(DATASETS, "20newsgroup/1/train.trec");
    File testFile = new File(DATASETS, "20newsgroup/1/test.trec");
    ClfDataSet trainData = TRECFormat.loadClfDataSet(trainFile, DataSetType.CLF_SPARSE, true);
    ClfDataSet testData = TRECFormat.loadClfDataSet(testFile, DataSetType.CLF_SPARSE, true);
    System.out.println(trainData.getMetaInfo());

    RidgeLogisticTrainer ridgeTrainer = RidgeLogisticTrainer.getBuilder()
            .setEpsilon(0.01)
            .setGaussianPriorVariance(0.5)
            .setHistory(5)
            .build();

    // The watch is printed while still running; it only brackets the train(...) call.
    StopWatch timer = new StopWatch();
    timer.start();
    LogisticRegression model = ridgeTrainer.train(trainData);
    System.out.println(timer);

    System.out.println("train: " + Accuracy.accuracy(model, trainData));
    System.out.println("test: " + Accuracy.accuracy(model, testData));
}
use of org.apache.commons.lang3.time.StopWatch in project pyramid by cheng-li.
the class LKTreeBoostTest method spam_missing_build.
/**
 * Manual check that boosts an {@code LKBoost} model for 200 rounds on the spam data set
 * with 50% missing values, reports the elapsed training time and training accuracy, then
 * prints per class how many labels match that class and the sum of the predicted
 * probabilities for it, and finally serializes the ensemble under the TMP directory.
 *
 * @throws Exception if loading, training or serialization fails
 */
static void spam_missing_build() throws Exception {
    File trainFile = new File(DATASETS, "/spam/missing_value/0.5_missing/train.trec");
    ClfDataSet dataSet = TRECFormat.loadClfDataSet(trainFile, DataSetType.CLF_DENSE, true);
    System.out.println(dataSet.getMetaInfo());

    LKBoost lkBoost = new LKBoost(2);
    LKBoostOptimizer trainer = new LKBoostOptimizer(lkBoost, dataSet);
    trainer.initialize();

    // Time only the boosting rounds.
    StopWatch timer = new StopWatch();
    timer.start();
    int round = 0;
    while (round < 200) {
        System.out.println("round=" + round);
        trainer.iterate();
        round++;
    }
    timer.stop();
    System.out.println(timer);

    double accuracy = Accuracy.accuracy(lkBoost, dataSet);
    System.out.println("accuracy=" + accuracy);

    int[] labels = dataSet.getLabels();
    List<double[]> classProbs = lkBoost.predictClassProbs(dataSet);
    int numClasses = dataSet.getNumClasses();
    int numDataPoints = dataSet.getNumDataPoints();
    for (int classIndex = 0; classIndex < numClasses; classIndex++) {
        int numMatches = 0;
        double sumProbs = 0;
        // Accumulate in data-point order so the floating-point sum is reproducible.
        for (int point = 0; point < numDataPoints; point++) {
            numMatches += (labels[point] == classIndex) ? 1 : 0;
            sumProbs += classProbs.get(point)[classIndex];
        }
        System.out.println("for class " + classIndex);
        System.out.println("number of matches =" + numMatches);
        System.out.println("sum of probs = " + sumProbs);
    }

    File ensembleFile = new File(TMP, "/LKTreeBoostTest/ensemble.ser");
    lkBoost.serialize(ensembleFile);
}
Aggregations