Search in sources :

Example 21 with MultiLabelClfDataSet

use of edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet in project pyramid by cheng-li.

the class MultiLabelSynthesizer method gaussianNoise.

/**
 * Builds a synthetic multi-label data set with 3 labels and 3 features in which the
 * linear label scores are perturbed by correlated multivariate Gaussian noise.
 * <p>
 * Every feature value is drawn with {@code Sampling.doubleUniform(-1, 1)}. Label
 * {@code k} is assigned to a data point when {@code w_k . x + noise_k >= 0}, using
 * the fixed weight vectors
 * <pre>
 * y0: w=(0,1,0)
 * y1: w=(1,0,0)
 * y2: w=(0,0,1)
 * </pre>
 * The number of label decisions changed by the noise is printed to standard output.
 *
 * @param numData number of data points to generate
 * @return the generated data set
 */
public static MultiLabelClfDataSet gaussianNoise(int numData) {
    int numClass = 3;
    int numFeature = 3;
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature).numClasses(numClass).numDataPoints(numData).build();
    // generate weights: each label depends on exactly one feature
    Vector[] weights = new Vector[numClass];
    for (int k = 0; k < numClass; k++) {
        weights[k] = new DenseVector(numFeature);
    }
    weights[0].set(1, 1);
    weights[1].set(0, 1);
    weights[2].set(2, 1);
    // generate features
    for (int i = 0; i < numData; i++) {
        for (int j = 0; j < numFeature; j++) {
            dataSet.setFeatureValue(i, j, Sampling.doubleUniform(-1, 1));
        }
    }
    // zero-mean noise with a symmetric covariance matrix (one noise term per label)
    double[] means = new double[numClass];
    double[][] covars = {
        { 0.5, 0.02, -0.03 },
        { 0.02, 0.2, -0.03 },
        { -0.03, -0.03, 0.3 }
    };
    MultivariateNormalDistribution distribution = new MultivariateNormalDistribution(means, covars);
    // assign labels
    int numFlipped = 0;
    for (int i = 0; i < numData; i++) {
        double[] noises = distribution.sample();
        Vector row = dataSet.getRow(i);
        for (int k = 0; k < numClass; k++) {
            double dot = weights[k].dot(row);
            double score = dot + noises[k];
            boolean noisyLabel = score >= 0;
            if (noisyLabel) {
                dataSet.addLabel(i, k);
            }
            // A bit is flipped when the noisy decision differs from the noise-free one.
            // Comparing the two threshold decisions (instead of the sign of dot * score)
            // also counts the boundary cases where dot or score is exactly zero.
            boolean cleanLabel = dot >= 0;
            if (cleanLabel != noisyLabel) {
                numFlipped += 1;
            }
        }
    }
    System.out.println("number of flipped bits = " + numFlipped);
    return dataSet;
}
Also used : DenseVector(org.apache.mahout.math.DenseVector) Vector(org.apache.mahout.math.Vector) MultiLabelClfDataSet(edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet) DenseVector(org.apache.mahout.math.DenseVector) MultivariateNormalDistribution(org.apache.commons.math3.distribution.MultivariateNormalDistribution)

Example 22 with MultiLabelClfDataSet

use of edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet in project pyramid by cheng-li.

the class MultiLabelSynthesizer method sampleFromMix.

/**
 * Samples a 10000-point, 2-feature, 2-label data set from a mixture of two clusters.
 * Each data point picks a cluster (cluster 0 with proportion 0.4, cluster 1 with
 * proportion 0.6) and receives label l when the chosen cluster's weight vector for
 * that label has a non-negative dot product with the feature row.
 * <pre>
 * C0, y0: w=(0,1)
 * C0, y1: w=(1,1)
 * C1, y0: w=(1,0)
 * C1, y1: w=(1,-1)
 * </pre>
 * The sampled cluster, row, weight and dot product are printed for every decision.
 *
 * @return the generated data set
 */
public static MultiLabelClfDataSet sampleFromMix() {
    final int numData = 10000;
    final int numClass = 2;
    final int numFeature = 2;
    final int numClusters = 2;
    final double[] proportions = { 0.4, 0.6 };
    final int[] indices = { 0, 1 };
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature).numClasses(numClass).numDataPoints(numData).build();

    // weight table indexed as [cluster][label][feature]
    final double[][][] coefficients = {
        { { 0, 1 }, { 1, 1 } },
        { { 1, 0 }, { 1, -1 } }
    };
    Vector[][] weights = new Vector[numClusters][numClass];
    for (int cluster = 0; cluster < numClusters; cluster++) {
        for (int label = 0; label < numClass; label++) {
            Vector w = new DenseVector(numFeature);
            for (int feature = 0; feature < numFeature; feature++) {
                w.set(feature, coefficients[cluster][label][feature]);
            }
            weights[cluster][label] = w;
        }
    }

    // fill every feature with a uniform sample
    for (int point = 0; point < numData; point++) {
        for (int feature = 0; feature < numFeature; feature++) {
            double value = Sampling.doubleUniform(-1, 1);
            dataSet.setFeatureValue(point, feature, value);
        }
    }

    IntegerDistribution clusterPicker = new EnumeratedIntegerDistribution(indices, proportions);

    // label each point using the weights of its sampled cluster
    for (int point = 0; point < numData; point++) {
        int chosen = clusterPicker.sample();
        System.out.println("cluster " + chosen);
        Vector[] clusterWeights = weights[chosen];
        for (int label = 0; label < numClass; label++) {
            System.out.println("row = " + dataSet.getRow(point));
            System.out.println("weight = " + clusterWeights[label]);
            double product = clusterWeights[label].dot(dataSet.getRow(point));
            System.out.println("dot = " + product);
            if (!(product < 0)) {
                dataSet.addLabel(point, label);
            }
        }
    }
    return dataSet;
}
Also used : EnumeratedIntegerDistribution(org.apache.commons.math3.distribution.EnumeratedIntegerDistribution) IntegerDistribution(org.apache.commons.math3.distribution.IntegerDistribution) EnumeratedIntegerDistribution(org.apache.commons.math3.distribution.EnumeratedIntegerDistribution) DenseVector(org.apache.mahout.math.DenseVector) Vector(org.apache.mahout.math.Vector) MultiLabelClfDataSet(edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet) DenseVector(org.apache.mahout.math.DenseVector)

Example 23 with MultiLabelClfDataSet

use of edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet in project pyramid by cheng-li.

the class MultiLabelSynthesizer method independent.

/**
 * Builds a 10000-point, 2-feature data set with 4 labels that are each decided
 * independently by a fixed linear rule: label k is assigned when the dot product of
 * its weight vector with the feature row is non-negative.
 * <pre>
 * y0: w=(0,1)
 * y1: w=(1,1)
 * y2: w=(1,0)
 * y3: w=(1,-1)
 * </pre>
 *
 * @return the generated data set
 */
public static MultiLabelClfDataSet independent() {
    final int numData = 10000;
    final int numClass = 4;
    final int numFeature = 2;
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature).numClasses(numClass).numDataPoints(numData).build();

    // weight table indexed as [label][feature]
    final double[][] coefficients = {
        { 0, 1 },
        { 1, 1 },
        { 1, 0 },
        { 1, -1 }
    };
    Vector[] weights = new Vector[numClass];
    for (int label = 0; label < numClass; label++) {
        Vector w = new DenseVector(numFeature);
        for (int feature = 0; feature < numFeature; feature++) {
            w.set(feature, coefficients[label][feature]);
        }
        weights[label] = w;
    }

    // fill every feature with a uniform sample
    for (int point = 0; point < numData; point++) {
        for (int feature = 0; feature < numFeature; feature++) {
            double value = Sampling.doubleUniform(-1, 1);
            dataSet.setFeatureValue(point, feature, value);
        }
    }

    // a label is present exactly when its linear score is non-negative
    for (int point = 0; point < numData; point++) {
        for (int label = 0; label < numClass; label++) {
            double score = weights[label].dot(dataSet.getRow(point));
            if (!(score < 0)) {
                dataSet.addLabel(point, label);
            }
        }
    }
    return dataSet;
}
Also used : DenseVector(org.apache.mahout.math.DenseVector) Vector(org.apache.mahout.math.Vector) MultiLabelClfDataSet(edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet) DenseVector(org.apache.mahout.math.DenseVector)

Example 24 with MultiLabelClfDataSet

use of edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet in project pyramid by cheng-li.

the class MultiLabelSynthesizer method randomTwoLabels.

/**
 * Builds a fixed 100-point data set with 1 feature and 2 labels. No feature values
 * are set; only the label assignments are populated:
 * <pre>
 * points  0..29 (30): labels 0 and 1
 * points 30..69 (40): label 0 only
 * points 70..99 (30): label 1 only
 * </pre>
 *
 * @return the generated data set
 */
public static MultiLabelClfDataSet randomTwoLabels() {
    final int total = 100;
    final int bothEnd = 30;
    final int firstOnlyEnd = 70;
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(1).numClasses(2).numDataPoints(total).build();
    for (int point = 0; point < total; point++) {
        boolean hasFirst = point < firstOnlyEnd;
        boolean hasSecond = point < bothEnd || point >= firstOnlyEnd;
        if (hasFirst) {
            dataSet.addLabel(point, 0);
        }
        if (hasSecond) {
            dataSet.addLabel(point, 1);
        }
    }
    return dataSet;
}
Also used : MultiLabelClfDataSet(edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet)

Example 25 with MultiLabelClfDataSet

use of edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet in project pyramid by cheng-li.

the class MultiLabelSynthesizer method flipOne.

/**
 * Builds a synthetic multi-label data set from random linear rules and then toggles
 * one sampled label on every data point.
 * <p>
 * Weights and features are all drawn with {@code Sampling.doubleUniform(-1, 1)}.
 * Label k is first assigned when the dot product of its weight vector with the
 * feature row is non-negative. Afterwards, for each data point, a class index is
 * sampled with {@code Sampling.intUniform(0, numClass - 1)} and that label is removed
 * if present or added if absent.
 *
 * @param numData number of data points
 * @param numFeature number of features
 * @param numClass number of labels
 * @return the generated data set
 */
public static MultiLabelClfDataSet flipOne(int numData, int numFeature, int numClass) {
    MultiLabelClfDataSet dataSet = MLClfDataSetBuilder.getBuilder().numFeatures(numFeature).numClasses(numClass).numDataPoints(numData).build();

    // one random weight vector per label
    Vector[] weights = new Vector[numClass];
    for (int label = 0; label < numClass; label++) {
        weights[label] = new DenseVector(numFeature);
        for (int feature = 0; feature < numFeature; feature++) {
            double coefficient = Sampling.doubleUniform(-1, 1);
            weights[label].set(feature, coefficient);
        }
    }

    // random feature values
    for (int point = 0; point < numData; point++) {
        for (int feature = 0; feature < numFeature; feature++) {
            double value = Sampling.doubleUniform(-1, 1);
            dataSet.setFeatureValue(point, feature, value);
        }
    }

    // noise-free labels from the linear rules
    for (int point = 0; point < numData; point++) {
        for (int label = 0; label < numClass; label++) {
            double score = weights[label].dot(dataSet.getRow(point));
            if (!(score < 0)) {
                dataSet.addLabel(point, label);
            }
        }
    }

    // toggle one sampled label per data point
    for (int point = 0; point < numData; point++) {
        int target = Sampling.intUniform(0, numClass - 1);
        MultiLabel assigned = dataSet.getMultiLabels()[point];
        boolean present = assigned.matchClass(target);
        if (!present) {
            assigned.addLabel(target);
        } else {
            assigned.removeLabel(target);
        }
    }
    return dataSet;
}
Also used : MultiLabel(edu.neu.ccs.pyramid.dataset.MultiLabel) DenseVector(org.apache.mahout.math.DenseVector) Vector(org.apache.mahout.math.Vector) MultiLabelClfDataSet(edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet) DenseVector(org.apache.mahout.math.DenseVector)

Aggregations

MultiLabelClfDataSet (edu.neu.ccs.pyramid.dataset.MultiLabelClfDataSet): 48, File (java.io.File): 24, MultiLabel (edu.neu.ccs.pyramid.dataset.MultiLabel): 23, CMLCRF (edu.neu.ccs.pyramid.multilabel_classification.crf.CMLCRF): 13, MLMeasures (edu.neu.ccs.pyramid.eval.MLMeasures): 12, LBFGS (edu.neu.ccs.pyramid.optimization.LBFGS): 9, Vector (org.apache.mahout.math.Vector): 9, Config (edu.neu.ccs.pyramid.configuration.Config): 7, CRFLoss (edu.neu.ccs.pyramid.multilabel_classification.crf.CRFLoss): 7, DenseVector (org.apache.mahout.math.DenseVector): 7, MultiLabelClassifier (edu.neu.ccs.pyramid.multilabel_classification.MultiLabelClassifier): 5, Pair (edu.neu.ccs.pyramid.util.Pair): 5, java.util (java.util): 5, Collectors (java.util.stream.Collectors): 5, IntStream (java.util.stream.IntStream): 5, DataSetUtil (edu.neu.ccs.pyramid.dataset.DataSetUtil): 4, TRECFormat (edu.neu.ccs.pyramid.dataset.TRECFormat): 4, MLScorer (edu.neu.ccs.pyramid.multilabel_classification.MLScorer): 4, StopWatch (org.apache.commons.lang3.time.StopWatch): 4, AccScorer (edu.neu.ccs.pyramid.multilabel_classification.AccScorer): 3