Search in sources :

Example 1 with DataSetType

use of edu.neu.ccs.pyramid.dataset.DataSetType in project pyramid by cheng-li.

the class GBRegressor method test.

/**
 * Evaluates a previously serialized {@link LSBoost} model on the configured test data.
 *
 * <p>The model is deserialized from the file {@code model} inside {@code output.folder}.
 * The test data named by {@code input.testData} is loaded as a dense or sparse regression
 * data set depending on {@code input.matrixType}. The test RMSE is printed to standard
 * output, and the model, data set and a {@code test_predictions.txt} file handle are handed
 * to {@code report}.
 *
 * @param config run configuration; the keys read are {@code output.folder},
 *               {@code input.matrixType} and {@code input.testData}
 * @throws IllegalArgumentException if {@code input.matrixType} is neither {@code "dense"}
 *                                  nor {@code "sparse"}
 * @throws Exception propagated from deserialization, data loading or reporting
 */
private static void test(Config config) throws Exception {
    String outputFolder = config.getString("output.folder");
    File modelFile = new File(outputFolder, "model");
    LSBoost model = (LSBoost) Serialization.deserialize(modelFile);

    // Map the textual matrix type onto the regression data set layout.
    String matrixType = config.getString("input.matrixType");
    DataSetType dataSetType;
    if (matrixType.equals("dense")) {
        dataSetType = DataSetType.REG_DENSE;
    } else if (matrixType.equals("sparse")) {
        dataSetType = DataSetType.REG_SPARSE;
    } else {
        throw new IllegalArgumentException("input.matrixType should be dense or sparse");
    }

    String testDataPath = config.getString("input.testData");
    RegDataSet testSet = TRECFormat.loadRegDataSet(testDataPath, dataSetType, true);
    double testRmse = RMSE.rmse(model, testSet);
    System.out.println("test RMSE = " + testRmse);

    File predictionsFile = new File(outputFolder, "test_predictions.txt");
    report(model, testSet, predictionsFile);
    System.out.println("predictions on the test set are written to " + predictionsFile.getAbsolutePath());
}
Also used : DataSetType(edu.neu.ccs.pyramid.dataset.DataSetType) RegDataSet(edu.neu.ccs.pyramid.dataset.RegDataSet) File(java.io.File) LSBoost(edu.neu.ccs.pyramid.regression.least_squares_boost.LSBoost)

Example 2 with DataSetType

use of edu.neu.ccs.pyramid.dataset.DataSetType in project pyramid by cheng-li.

the class GBClassifier method train.

/**
 * Trains an {@link LKBoost} classifier and writes the model and training-set reports.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Load the training set ({@code input.trainData}) and, only when
 *       {@code train.showTestProgress} is true, the test set ({@code input.testData}), using
 *       the dense or sparse classification layout selected by {@code input.matrixType}.</li>
 *   <li>Run {@code train.numIterations} boosting iterations with trees limited to
 *       {@code train.numLeaves} leaves and shrinkage {@code train.shrinkage}. Accuracy is
 *       printed on every iteration that is a multiple of
 *       {@code train.showProgress.interval}, and on the last iteration.</li>
 *   <li>Create {@code output.folder}, serialize the model to {@code model}, save a PMML copy
 *       to {@code model.pmml}, and hand {@code train_predictions.txt} and
 *       {@code train_predicted_probabilities.txt} to {@code report} and
 *       {@code probabilities} respectively.</li>
 * </ol>
 *
 * <p>The progress flags are read from the configuration once, before the loop, instead of
 * on every iteration. The test-progress flag that decides whether the test set is loaded is
 * the same value that decides whether it is evaluated.
 *
 * @param config run configuration supplying the keys named above
 * @throws IllegalArgumentException if {@code input.matrixType} is neither {@code "dense"}
 *                                  nor {@code "sparse"}, or if progress reporting is enabled
 *                                  with a {@code train.showProgress.interval} of 0
 * @throws Exception propagated from data loading, training, serialization or reporting
 */
private static void train(Config config) throws Exception {
    String sparsity = config.getString("input.matrixType");
    DataSetType dataSetType;
    switch (sparsity) {
        case "dense":
            dataSetType = DataSetType.CLF_DENSE;
            break;
        case "sparse":
            dataSetType = DataSetType.CLF_SPARSE;
            break;
        default:
            throw new IllegalArgumentException("input.matrixType should be dense or sparse");
    }
    ClfDataSet trainSet = TRECFormat.loadClfDataSet(config.getString("input.trainData"), dataSetType, true);

    // Read once: this single value controls both loading and evaluating the test set,
    // so testSet is guaranteed non-null wherever it is evaluated below.
    boolean showTestProgress = config.getBoolean("train.showTestProgress");
    ClfDataSet testSet = null;
    if (showTestProgress) {
        testSet = TRECFormat.loadClfDataSet(config.getString("input.testData"), dataSetType, true);
    }

    int numClasses = trainSet.getNumClasses();
    LKBoost lkBoost = new LKBoost(numClasses);
    RegTreeConfig regTreeConfig = new RegTreeConfig().setMaxNumLeaves(config.getInt("train.numLeaves"));
    RegTreeFactory regTreeFactory = new RegTreeFactory(regTreeConfig);
    regTreeFactory.setLeafOutputCalculator(new LKBOutputCalculator(numClasses));
    LKBoostOptimizer optimizer = new LKBoostOptimizer(lkBoost, trainSet, regTreeFactory);
    optimizer.setShrinkage(config.getDouble("train.shrinkage"));
    optimizer.initialize();

    int progressInterval = config.getInt("train.showProgress.interval");
    int numIterations = config.getInt("train.numIterations");
    boolean showTrainProgress = config.getBoolean("train.showTrainProgress");
    boolean showAnyProgress = showTrainProgress || showTestProgress;
    // A zero interval would otherwise surface as "/ by zero" after the first iteration;
    // reject it up front with a message that names the offending key.
    if (showAnyProgress && numIterations >= 1 && progressInterval == 0) {
        throw new IllegalArgumentException("train.showProgress.interval should not be 0");
    }

    for (int i = 1; i <= numIterations; i++) {
        System.out.println("iteration " + i);
        optimizer.iterate();
        // The modulo is only evaluated when some progress output is requested.
        if (showAnyProgress && (i % progressInterval == 0 || i == numIterations)) {
            if (showTrainProgress) {
                System.out.println("training accuracy = " + Accuracy.accuracy(lkBoost, trainSet));
            }
            if (showTestProgress) {
                System.out.println("test accuracy = " + Accuracy.accuracy(lkBoost, testSet));
            }
        }
    }
    System.out.println("training done!");

    String output = config.getString("output.folder");
    new File(output).mkdirs();
    File serializedModel = new File(output, "model");
    Serialization.serialize(lkBoost, serializedModel);
    System.out.println("model saved to " + serializedModel.getAbsolutePath());

    File pmmlModel = new File(output, "model.pmml");
    PMMLConverter.savePMML(lkBoost, pmmlModel);
    System.out.println("PMML model saved to " + pmmlModel.getAbsolutePath());

    File reportFile = new File(output, "train_predictions.txt");
    report(lkBoost, trainSet, reportFile);
    System.out.println("predictions on the training set are written to " + reportFile.getAbsolutePath());

    File probabilitiesFile = new File(output, "train_predicted_probabilities.txt");
    probabilities(lkBoost, trainSet, probabilitiesFile);
    System.out.println("predicted probabilities on the training set are written to " + probabilitiesFile.getAbsolutePath());
}
Also used : LKBoostOptimizer(edu.neu.ccs.pyramid.classification.lkboost.LKBoostOptimizer) RegTreeConfig(edu.neu.ccs.pyramid.regression.regression_tree.RegTreeConfig) DataSetType(edu.neu.ccs.pyramid.dataset.DataSetType) ClfDataSet(edu.neu.ccs.pyramid.dataset.ClfDataSet) RegTreeFactory(edu.neu.ccs.pyramid.regression.regression_tree.RegTreeFactory) LKBOutputCalculator(edu.neu.ccs.pyramid.classification.lkboost.LKBOutputCalculator) File(java.io.File) LKBoost(edu.neu.ccs.pyramid.classification.lkboost.LKBoost)

Example 3 with DataSetType

use of edu.neu.ccs.pyramid.dataset.DataSetType in project pyramid by cheng-li.

the class GBClassifier method test.

/**
 * Evaluates a previously serialized {@link LKBoost} classifier on the configured test data.
 *
 * <p>The model is deserialized from the file {@code model} inside {@code output.folder}.
 * The test data named by {@code input.testData} is loaded as a dense or sparse
 * classification data set depending on {@code input.matrixType}. The test accuracy is
 * printed to standard output. {@code test_predictions.txt} is handed to {@code report} and
 * {@code test_predicted_probabilities.txt} to {@code probabilities}.
 *
 * @param config run configuration; the keys read are {@code output.folder},
 *               {@code input.matrixType} and {@code input.testData}
 * @throws IllegalArgumentException if {@code input.matrixType} is neither {@code "dense"}
 *                                  nor {@code "sparse"}
 * @throws Exception propagated from deserialization, data loading or reporting
 */
private static void test(Config config) throws Exception {
    String outputFolder = config.getString("output.folder");
    File modelFile = new File(outputFolder, "model");
    LKBoost classifier = (LKBoost) Serialization.deserialize(modelFile);

    // Map the textual matrix type onto the classification data set layout.
    String matrixType = config.getString("input.matrixType");
    DataSetType dataSetType;
    if (matrixType.equals("dense")) {
        dataSetType = DataSetType.CLF_DENSE;
    } else if (matrixType.equals("sparse")) {
        dataSetType = DataSetType.CLF_SPARSE;
    } else {
        throw new IllegalArgumentException("input.matrixType should be dense or sparse");
    }

    String testDataPath = config.getString("input.testData");
    ClfDataSet testSet = TRECFormat.loadClfDataSet(testDataPath, dataSetType, true);
    double testAccuracy = Accuracy.accuracy(classifier, testSet);
    System.out.println("test accuracy = " + testAccuracy);

    File predictionsFile = new File(outputFolder, "test_predictions.txt");
    report(classifier, testSet, predictionsFile);
    System.out.println("predictions on the test set are written to " + predictionsFile.getAbsolutePath());

    File probabilityFile = new File(outputFolder, "test_predicted_probabilities.txt");
    probabilities(classifier, testSet, probabilityFile);
    System.out.println("predicted probabilities on the test set are written to " + probabilityFile.getAbsolutePath());
}
Also used : DataSetType(edu.neu.ccs.pyramid.dataset.DataSetType) ClfDataSet(edu.neu.ccs.pyramid.dataset.ClfDataSet) File(java.io.File) LKBoost(edu.neu.ccs.pyramid.classification.lkboost.LKBoost)

Example 4 with DataSetType

use of edu.neu.ccs.pyramid.dataset.DataSetType in project pyramid by cheng-li.

the class GBRegressor method train.

/**
 * Trains an {@link LSBoost} regressor and writes the model and a training-set report.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Load the training set ({@code input.trainData}) and, only when
 *       {@code train.showTestProgress} is true, the test set ({@code input.testData}), using
 *       the dense or sparse regression layout selected by {@code input.matrixType}.</li>
 *   <li>Run {@code train.numIterations} boosting iterations with trees limited to
 *       {@code train.numLeaves} leaves and shrinkage {@code train.shrinkage}. RMSE is logged
 *       on every iteration that is a multiple of {@code train.showProgress.interval}, and on
 *       the last iteration.</li>
 *   <li>Create {@code output.folder}, serialize the model to {@code model}, save a PMML copy
 *       to {@code model.pmml} when {@code output.generatePMML} is true, and hand
 *       {@code <output.folder>/<output.trainReportFolderName>/train_predictions.txt} to
 *       {@code report}.</li>
 * </ol>
 *
 * <p>The progress flags are read from the configuration once, before the loop, instead of
 * on every iteration. The test-progress flag that decides whether the test set is loaded is
 * the same value that decides whether it is evaluated.
 *
 * @param config run configuration supplying the keys named above
 * @param logger destination for progress and status messages
 * @throws IllegalArgumentException if {@code input.matrixType} is neither {@code "dense"}
 *                                  nor {@code "sparse"}, or if progress reporting is enabled
 *                                  with a {@code train.showProgress.interval} of 0
 * @throws Exception propagated from data loading, training, serialization or reporting
 */
private static void train(Config config, Logger logger) throws Exception {
    String sparsity = config.getString("input.matrixType");
    DataSetType dataSetType;
    switch (sparsity) {
        case "dense":
            dataSetType = DataSetType.REG_DENSE;
            break;
        case "sparse":
            dataSetType = DataSetType.REG_SPARSE;
            break;
        default:
            throw new IllegalArgumentException("input.matrixType should be dense or sparse");
    }
    RegDataSet trainSet = TRECFormat.loadRegDataSet(config.getString("input.trainData"), dataSetType, true);

    // Read once: this single value controls both loading and evaluating the test set,
    // so testSet is guaranteed non-null wherever it is evaluated below.
    boolean showTestProgress = config.getBoolean("train.showTestProgress");
    RegDataSet testSet = null;
    if (showTestProgress) {
        testSet = TRECFormat.loadRegDataSet(config.getString("input.testData"), dataSetType, true);
    }

    LSBoost lsBoost = new LSBoost();
    RegTreeConfig regTreeConfig = new RegTreeConfig().setMaxNumLeaves(config.getInt("train.numLeaves"));
    RegTreeFactory regTreeFactory = new RegTreeFactory(regTreeConfig);
    LSBoostOptimizer optimizer = new LSBoostOptimizer(lsBoost, trainSet, regTreeFactory);
    optimizer.setShrinkage(config.getDouble("train.shrinkage"));
    optimizer.initialize();

    int progressInterval = config.getInt("train.showProgress.interval");
    int numIterations = config.getInt("train.numIterations");
    boolean showTrainProgress = config.getBoolean("train.showTrainProgress");
    boolean showAnyProgress = showTrainProgress || showTestProgress;
    // A zero interval would otherwise surface as "/ by zero" after the first iteration;
    // reject it up front with a message that names the offending key.
    if (showAnyProgress && numIterations >= 1 && progressInterval == 0) {
        throw new IllegalArgumentException("train.showProgress.interval should not be 0");
    }

    for (int i = 1; i <= numIterations; i++) {
        logger.info("iteration " + i);
        optimizer.iterate();
        // The modulo is only evaluated when some progress output is requested.
        if (showAnyProgress && (i % progressInterval == 0 || i == numIterations)) {
            if (showTrainProgress) {
                logger.info("training RMSE = " + RMSE.rmse(lsBoost, trainSet));
            }
            if (showTestProgress) {
                logger.info("test RMSE = " + RMSE.rmse(lsBoost, testSet));
            }
        }
    }
    logger.info("training done!");

    String output = config.getString("output.folder");
    new File(output).mkdirs();
    File serializedModel = new File(output, "model");
    Serialization.serialize(lsBoost, serializedModel);
    logger.info("model saved to " + serializedModel.getAbsolutePath());

    if (config.getBoolean("output.generatePMML")) {
        File pmmlModel = new File(output, "model.pmml");
        PMMLConverter.savePMML(lsBoost, pmmlModel);
        logger.info("PMML model saved to " + pmmlModel.getAbsolutePath());
    }

    String trainReportName = config.getString("output.trainReportFolderName");
    File reportFile = Paths.get(output, trainReportName, "train_predictions.txt").toFile();
    report(lsBoost, trainSet, reportFile);
    logger.info("predictions on the training set are written to " + reportFile.getAbsolutePath());
}
Also used : RegTreeConfig(edu.neu.ccs.pyramid.regression.regression_tree.RegTreeConfig) DataSetType(edu.neu.ccs.pyramid.dataset.DataSetType) LSBoostOptimizer(edu.neu.ccs.pyramid.regression.least_squares_boost.LSBoostOptimizer) RegTreeFactory(edu.neu.ccs.pyramid.regression.regression_tree.RegTreeFactory) RegDataSet(edu.neu.ccs.pyramid.dataset.RegDataSet) File(java.io.File) LSBoost(edu.neu.ccs.pyramid.regression.least_squares_boost.LSBoost)

Example 5 with DataSetType

use of edu.neu.ccs.pyramid.dataset.DataSetType in project pyramid by cheng-li.

the class LinearRegElasticNet method main.

/**
 * Command-line entry point: fits an elastic-net regularized linear regression and writes
 * the selected features and test predictions to the output folder.
 *
 * <p>The single argument is the path of a properties file. Keys read from it:
 * {@code output.folder}, {@code featureMatrix.sparsity} (lower-cased before matching),
 * {@code input.trainSet}, {@code input.testSet}, {@code regularization} and {@code l1Ratio}.
 *
 * <p>RMSE on both data sets is printed before and after optimization. The non-zero weights
 * (bias excluded) are written twice: ordered by feature index to
 * {@code features_sorted_by_indices.txt}, and ordered by descending absolute weight to
 * {@code features_sorted_by_weights.txt}. Finally {@code test_predictions.txt} is handed to
 * {@code report}.
 *
 * @param args exactly one element: the properties file path
 * @throws IllegalArgumentException if the argument count is not 1 or
 *                                  {@code featureMatrix.sparsity} is neither dense nor sparse
 * @throws Exception propagated from configuration, data loading, training or file output
 */
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        throw new IllegalArgumentException("Please specify a properties file.");
    }
    Config config = new Config(args[0]);
    System.out.println(config);

    String outputDir = config.getString("output.folder");
    new File(outputDir).mkdirs();

    String matrixKind = config.getString("featureMatrix.sparsity").toLowerCase();
    DataSetType dataSetType;
    if (matrixKind.equals("dense")) {
        dataSetType = DataSetType.REG_DENSE;
    } else if (matrixKind.equals("sparse")) {
        dataSetType = DataSetType.REG_SPARSE;
    } else {
        throw new IllegalArgumentException("featureMatrix.sparsity can be either dense or sparse");
    }

    RegDataSet trainSet = TRECFormat.loadRegDataSet(config.getString("input.trainSet"), dataSetType, true);
    RegDataSet testSet = TRECFormat.loadRegDataSet(config.getString("input.testSet"), dataSetType, true);

    LinearRegression model = new LinearRegression(trainSet.getNumFeatures());
    ElasticNetLinearRegOptimizer optimizer = new ElasticNetLinearRegOptimizer(model, trainSet);
    optimizer.setRegularization(config.getDouble("regularization"));
    optimizer.setL1Ratio(config.getDouble("l1Ratio"));

    System.out.println("before training");
    System.out.println("training set RMSE = " + RMSE.rmse(model, trainSet));
    System.out.println("test set RMSE = " + RMSE.rmse(model, testSet));

    System.out.println("start training");
    StopWatch timer = new StopWatch();
    timer.start();
    optimizer.optimize();
    System.out.println("training done");
    System.out.println("time spent on training = " + timer);

    System.out.println("after training");
    System.out.println("training set RMSE = " + RMSE.rmse(model, trainSet));
    System.out.println("test set RMSE = " + RMSE.rmse(model, testSet));
    System.out.println("number of non-zeros weights in linear regression (not including bias) = " + model.getWeights().getWeightsWithoutBias().getNumNonZeroElements());

    // Collect (feature index, weight) pairs for every non-zero weight.
    List<Pair<Integer, Double>> byIndex = new ArrayList<>();
    for (Vector.Element entry : model.getWeights().getWeightsWithoutBias().nonZeroes()) {
        byIndex.add(new Pair<>(entry.index(), entry.get()));
    }

    // First listing: ascending feature index.
    Comparator<Pair<Integer, Double>> indexOrder = Comparator.comparing(Pair::getFirst);
    byIndex.sort(indexOrder);
    StringBuilder indexListing = new StringBuilder();
    for (Pair<Integer, Double> weight : byIndex) {
        int featureIndex = weight.getFirst();
        indexListing.append(featureIndex)
                .append("(")
                .append(trainSet.getFeatureList().get(featureIndex).getName())
                .append(")")
                .append(":")
                .append(weight.getSecond())
                .append("\n");
    }
    File byIndexFile = new File(outputDir, "features_sorted_by_indices.txt");
    FileUtils.writeStringToFile(byIndexFile, indexListing.toString());
    System.out.println("all selected features (sorted by indices) are saved to " + byIndexFile.getAbsolutePath());

    // Second listing: descending absolute weight. The sort is stable and starts from the
    // index-ordered list, so equal magnitudes stay in ascending index order.
    Comparator<Pair<Integer, Double>> magnitudeOrder = Comparator.comparing(weight -> Math.abs(weight.getSecond()));
    List<Pair<Integer, Double>> byMagnitude = new ArrayList<>(byIndex);
    byMagnitude.sort(magnitudeOrder.reversed());
    StringBuilder magnitudeListing = new StringBuilder();
    for (Pair<Integer, Double> weight : byMagnitude) {
        int featureIndex = weight.getFirst();
        magnitudeListing.append(featureIndex)
                .append("(")
                .append(trainSet.getFeatureList().get(featureIndex).getName())
                .append(")")
                .append(":")
                .append(weight.getSecond())
                .append("\n");
    }
    File byWeightFile = new File(outputDir, "features_sorted_by_weights.txt");
    FileUtils.writeStringToFile(byWeightFile, magnitudeListing.toString());
    System.out.println("all selected features (sorted by absolute weights) are saved to " + byWeightFile.getAbsolutePath());

    File predictionsFile = new File(outputDir, "test_predictions.txt");
    report(model, testSet, predictionsFile);
    System.out.println("predictions on the test set are written to " + predictionsFile.getAbsolutePath());
}
Also used : DataSetType(edu.neu.ccs.pyramid.dataset.DataSetType) Config(edu.neu.ccs.pyramid.configuration.Config) ArrayList(java.util.ArrayList) StopWatch(org.apache.commons.lang3.time.StopWatch) ElasticNetLinearRegOptimizer(edu.neu.ccs.pyramid.regression.linear_regression.ElasticNetLinearRegOptimizer) RegDataSet(edu.neu.ccs.pyramid.dataset.RegDataSet) File(java.io.File) LinearRegression(edu.neu.ccs.pyramid.regression.linear_regression.LinearRegression) Vector(org.apache.mahout.math.Vector) Pair(edu.neu.ccs.pyramid.util.Pair)

Aggregations

DataSetType (edu.neu.ccs.pyramid.dataset.DataSetType)7 File (java.io.File)7 RegDataSet (edu.neu.ccs.pyramid.dataset.RegDataSet)5 LSBoost (edu.neu.ccs.pyramid.regression.least_squares_boost.LSBoost)4 RegTreeConfig (edu.neu.ccs.pyramid.regression.regression_tree.RegTreeConfig)3 RegTreeFactory (edu.neu.ccs.pyramid.regression.regression_tree.RegTreeFactory)3 LKBoost (edu.neu.ccs.pyramid.classification.lkboost.LKBoost)2 ClfDataSet (edu.neu.ccs.pyramid.dataset.ClfDataSet)2 LSBoostOptimizer (edu.neu.ccs.pyramid.regression.least_squares_boost.LSBoostOptimizer)2 LKBOutputCalculator (edu.neu.ccs.pyramid.classification.lkboost.LKBOutputCalculator)1 LKBoostOptimizer (edu.neu.ccs.pyramid.classification.lkboost.LKBoostOptimizer)1 Config (edu.neu.ccs.pyramid.configuration.Config)1 ElasticNetLinearRegOptimizer (edu.neu.ccs.pyramid.regression.linear_regression.ElasticNetLinearRegOptimizer)1 LinearRegression (edu.neu.ccs.pyramid.regression.linear_regression.LinearRegression)1 Pair (edu.neu.ccs.pyramid.util.Pair)1 ArrayList (java.util.ArrayList)1 StopWatch (org.apache.commons.lang3.time.StopWatch)1 Vector (org.apache.mahout.math.Vector)1