Search in sources :

Example 1 with DataSourceProvenance

use of org.tribuo.provenance.DataSourceProvenance in project tribuo by oracle.

the class LabelledDataGenerator method sparseTrainTest.

/**
 * Builds a small train/test pair over the 4 classes {Foo,Bar,Baz,Quux}
 * in which each example carries four named features drawn from a larger pool.
 * <p>
 * The test examples use some feature names (AA, BB, CC, DD, EE) that never
 * occur in the training examples.
 * @param negate Multiplier applied to selected feature values; supply -1.0 to negate them.
 * @return A pair holding the training dataset first and the test dataset second.
 */
public static Pair<Dataset<Label>, Dataset<Label>> sparseTrainTest(double negate) {
    DataSourceProvenance trainSource = new SimpleDataSourceProvenance("TrainingData", OffsetDateTime.now(), labelFactory);
    MutableDataset<Label> train = new MutableDataset<>(trainSource, labelFactory);

    // Row i of each table below describes training example i.
    String[] trainLabels = {
        "Foo", "Foo", "Foo",
        "Bar", "Bar", "Bar",
        "Baz", "Baz", "Baz",
        "Quux", "Quux", "Quux"
    };
    String[][] trainNames = {
        { "A", "B", "C", "D" },
        { "B", "D", "F", "H" },
        { "A", "J", "D", "M" },
        { "C", "E", "F", "H" },
        { "E", "G", "F", "I" },
        { "J", "K", "C", "E" },
        { "E", "A", "K", "J" },
        { "B", "C", "E", "H" },
        { "A", "M", "I", "J" },
        { "Z", "A", "B", "C" },
        { "K", "V", "E", "D" },
        { "B", "G", "E", "A" }
    };
    double[][] trainValues = {
        { 1.0, 0.5, 1.0, negate * 1.0 },
        { 1.5, 0.35, 1.3, negate * 1.2 },
        { 1.2, 0.45, 1.5, negate * 1.0 },
        { negate * 1.1, 0.55, negate * 1.5, 0.5 },
        { negate * 1.5, 0.25, negate * 1, 0.125 },
        { negate * 1, 0.5, negate * 1.123, 0.123 },
        { 1.5, 5.0, 0.5, 4.5 },
        { 1.234, 5.1235, 0.1235, 6.0 },
        { 1.734, 4.5, 0.5123, 5.5 },
        { negate * 1, 0.25, 5, 10.0 },
        { negate * 1.4, 0.55, 5.65, 12.0 },
        { negate * 1.9, 0.25, 5.9, 15 }
    };
    for (int i = 0; i < trainLabels.length; i++) {
        train.add(new ArrayExample<>(new Label(trainLabels[i]), trainNames[i], trainValues[i]));
    }

    DataSourceProvenance testSource = new SimpleDataSourceProvenance("TestingData", OffsetDateTime.now(), labelFactory);
    MutableDataset<Label> test = new MutableDataset<>(testSource, labelFactory);

    // One test example per class.
    String[] testLabels = { "Foo", "Bar", "Baz", "Quux" };
    String[][] testNames = {
        { "AA", "B", "C", "D" },
        { "B", "BB", "F", "E" },
        { "B", "E", "G", "H" },
        { "B", "CC", "DD", "EE" }
    };
    double[][] testValues = {
        { 2.0, 0.45, 3.5, negate * 2.0 },
        { negate * 2.0, 0.55, negate * 2.5, 2.5 },
        { 1.75, 5.0, 1.0, 6.5 },
        { negate * 1.5, 0.25, 5.0, 20.0 }
    };
    for (int i = 0; i < testLabels.length; i++) {
        test.add(new ArrayExample<>(new Label(testLabels[i]), testNames[i], testValues[i]));
    }

    return new Pair<>(train, test);
}
Also used : SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Label(org.tribuo.classification.Label) MutableDataset(org.tribuo.MutableDataset) DataSourceProvenance(org.tribuo.provenance.DataSourceProvenance) SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 2 with DataSourceProvenance

use of org.tribuo.provenance.DataSourceProvenance in project tribuo by oracle.

the class MultiLabelDataGenerator method generateTestData.

/**
 * Builds a four-example dataset for exercising multi-label trainers.
 * <p>
 * Every example has the three features A-MONKEY, B-PUZZLE and C-TREE. The first
 * example's output is generated from "MONKEY,PUZZLE,TREE" with all features set
 * to 1.0; each remaining example's output is generated from a single label name
 * with only the matching feature set to 1.0 and the others set to 0.0.
 * @return Simple 3 class test data.
 */
public static Dataset<MultiLabel> generateTestData() {
    DataSourceProvenance source = new SimpleDataSourceProvenance("TestingData", OffsetDateTime.now(), factory);
    MutableDataset<MultiLabel> data = new MutableDataset<>(source, factory);

    String[] featureNames = { "A-MONKEY", "B-PUZZLE", "C-TREE" };
    // Row r of featureValues belongs to the output generated from labelSpecs[r].
    String[] labelSpecs = { "MONKEY,PUZZLE,TREE", "MONKEY", "PUZZLE", "TREE" };
    double[][] featureValues = {
        { 1.0, 1.0, 1.0 },
        { 1.0, 0.0, 0.0 },
        { 0.0, 1.0, 0.0 },
        { 0.0, 0.0, 1.0 }
    };

    for (int row = 0; row < labelSpecs.length; row++) {
        ArrayExample<MultiLabel> example = new ArrayExample<>(factory.generateOutput(labelSpecs[row]));
        for (int col = 0; col < featureNames.length; col++) {
            example.add(new Feature(featureNames[col], featureValues[row][col]));
        }
        data.add(example);
    }
    return data;
}
Also used : ArrayExample(org.tribuo.impl.ArrayExample) MultiLabel(org.tribuo.multilabel.MultiLabel) SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) MutableDataset(org.tribuo.MutableDataset) Feature(org.tribuo.Feature) DataSourceProvenance(org.tribuo.provenance.DataSourceProvenance) SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance)

Example 3 with DataSourceProvenance

use of org.tribuo.provenance.DataSourceProvenance in project tribuo by oracle.

the class LabelledDataGenerator method denseTrainTest.

/**
 * Builds a train/test pair in which every example has a value for each of
 * the 4 features {A,B,C,D}, labelled with one of the 4 classes
 * {Foo,Bar,Baz,Quux}.
 * @param negate Multiplier applied to selected feature values; supply -1.0 to make them negative.
 * @return A pair holding the training dataset first and the test dataset second.
 */
public static Pair<Dataset<Label>, Dataset<Label>> denseTrainTest(double negate) {
    String[] featureNames = { "A", "B", "C", "D" };

    DataSourceProvenance trainSource = new SimpleDataSourceProvenance("TrainingData", OffsetDateTime.now(), labelFactory);
    MutableDataset<Label> train = new MutableDataset<>(trainSource, labelFactory);

    // Row i of trainValues is labelled with trainLabels[i].
    String[] trainLabels = {
        "Foo", "Foo", "Foo",
        "Bar", "Bar", "Bar",
        "Baz", "Baz", "Baz",
        "Quux", "Quux", "Quux"
    };
    double[][] trainValues = {
        { 1.0, 0.5, 1.0, negate * 1.0 },
        { 1.5, 0.35, 1.3, negate * 1.2 },
        { 1.2, 0.45, 1.5, negate * 1.0 },
        { negate * 1.1, 0.55, negate * 1.5, 0.5 },
        { negate * 1.5, 0.25, negate * 1, 0.125 },
        { negate * 1, 0.5, negate * 1.123, 0.123 },
        { 1.5, 5.0, 0.5, 4.5 },
        { 1.234, 5.1235, 0.1235, 6.0 },
        { 1.734, 4.5, 0.5123, 5.5 },
        { negate * 1, 0.25, 5, 10.0 },
        { negate * 1.4, 0.55, 5.65, 12.0 },
        { negate * 1.9, 0.25, 5.9, 15 }
    };
    for (int i = 0; i < trainLabels.length; i++) {
        // Hand each example its own copy of the feature names.
        train.add(new ArrayExample<>(new Label(trainLabels[i]), featureNames.clone(), trainValues[i]));
    }

    DataSourceProvenance testSource = new SimpleDataSourceProvenance("TestingData", OffsetDateTime.now(), labelFactory);
    MutableDataset<Label> test = new MutableDataset<>(testSource, labelFactory);

    // One test example per class.
    String[] testLabels = { "Foo", "Bar", "Baz", "Quux" };
    double[][] testValues = {
        { 2.0, 0.45, 3.5, negate * 2.0 },
        { negate * 2.0, 0.55, negate * 2.5, 2.5 },
        { 1.75, 5.0, 1.0, 6.5 },
        { negate * 1.5, 0.25, 5.0, 20.0 }
    };
    for (int i = 0; i < testLabels.length; i++) {
        test.add(new ArrayExample<>(new Label(testLabels[i]), featureNames.clone(), testValues[i]));
    }

    return new Pair<>(train, test);
}
Also used : SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Label(org.tribuo.classification.Label) MutableDataset(org.tribuo.MutableDataset) DataSourceProvenance(org.tribuo.provenance.DataSourceProvenance) SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 4 with DataSourceProvenance

use of org.tribuo.provenance.DataSourceProvenance in project tribuo by oracle.

the class LabelledDataGenerator method binarySparseTrainTest.

/**
 * Builds a small train/test pair with binary labels {Foo,Bar} in which each
 * example carries four named features drawn from a larger pool.
 * <p>
 * The test examples use some feature names (AA, BB, CC, DD, EE) that never
 * occur in the training examples.
 * @param negate Multiplier applied to selected feature values; supply -1.0 to negate them.
 * @return A pair holding the training dataset first and the test dataset second.
 */
public static Pair<Dataset<Label>, Dataset<Label>> binarySparseTrainTest(double negate) {
    DataSourceProvenance trainSource = new SimpleDataSourceProvenance("TrainingData", OffsetDateTime.now(), labelFactory);
    MutableDataset<Label> train = new MutableDataset<>(trainSource, labelFactory);

    // Row i of each table below describes training example i.
    String[] trainLabels = {
        "Foo", "Foo", "Foo",
        "Bar", "Bar", "Bar",
        "Foo", "Foo", "Foo",
        "Bar", "Bar", "Bar"
    };
    String[][] trainNames = {
        { "A", "B", "C", "D" },
        { "B", "D", "F", "H" },
        { "A", "J", "D", "M" },
        { "C", "E", "F", "H" },
        { "E", "G", "F", "I" },
        { "J", "K", "C", "E" },
        { "E", "A", "K", "J" },
        { "B", "C", "E", "H" },
        { "A", "M", "I", "J" },
        { "Z", "A", "B", "C" },
        { "K", "V", "E", "D" },
        { "B", "G", "E", "A" }
    };
    double[][] trainValues = {
        { 1.0, 0.5, 1.0, negate * 1.0 },
        { 1.5, 0.35, 1.3, negate * 1.2 },
        { 1.2, 0.45, 1.5, negate * 1.0 },
        { negate * 1.1, 0.55, negate * 1.5, 0.5 },
        { negate * 1.5, 0.25, negate * 1, 0.125 },
        { negate * 1, 0.5, negate * 1.123, 0.123 },
        { 1.5, 5.0, 0.5, 4.5 },
        { 1.234, 5.1235, 0.1235, 6.0 },
        { 1.734, 4.5, 0.5123, 5.5 },
        { negate * 1, 0.25, 5, 10.0 },
        { negate * 1.4, 0.55, 5.65, 12.0 },
        { negate * 1.9, 0.25, 5.9, 15 }
    };
    for (int i = 0; i < trainLabels.length; i++) {
        train.add(new ArrayExample<>(new Label(trainLabels[i]), trainNames[i], trainValues[i]));
    }

    DataSourceProvenance testSource = new SimpleDataSourceProvenance("TestingData", OffsetDateTime.now(), labelFactory);
    MutableDataset<Label> test = new MutableDataset<>(testSource, labelFactory);

    // Two test examples per class, alternating Foo and Bar.
    String[] testLabels = { "Foo", "Bar", "Foo", "Bar" };
    String[][] testNames = {
        { "AA", "B", "C", "D" },
        { "B", "BB", "F", "E" },
        { "B", "E", "G", "H" },
        { "B", "CC", "DD", "EE" }
    };
    double[][] testValues = {
        { 2.0, 0.45, 3.5, negate * 2.0 },
        { negate * 2.0, 0.55, negate * 2.5, 2.5 },
        { 1.75, 5.0, 1.0, 6.5 },
        { negate * 1.5, 0.25, 5.0, 20.0 }
    };
    for (int i = 0; i < testLabels.length; i++) {
        test.add(new ArrayExample<>(new Label(testLabels[i]), testNames[i], testValues[i]));
    }

    return new Pair<>(train, test);
}
Also used : SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Label(org.tribuo.classification.Label) MutableDataset(org.tribuo.MutableDataset) DataSourceProvenance(org.tribuo.provenance.DataSourceProvenance) SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Example 5 with DataSourceProvenance

use of org.tribuo.provenance.DataSourceProvenance in project tribuo by oracle.

the class RegressionDataGenerator method denseTrainTest.

/**
 * Builds a regression train/test pair in which every example has a value for
 * each of the 4 features {A,B,C,D} and a single-dimensional target named by
 * SINGLE_DIM_NAME.
 * @param negate Multiplier applied to selected feature values; supply -1.0 to negate them.
 * @return A pair holding the training dataset first and the test dataset second.
 */
public static Pair<Dataset<Regressor>, Dataset<Regressor>> denseTrainTest(double negate) {
    // The same name array is passed to every example, as in the per-example calls this loop replaces.
    String[] featureNames = { "A", "B", "C", "D" };

    DataSourceProvenance trainSource = new SimpleDataSourceProvenance("TrainingData", OffsetDateTime.now(), REGRESSION_FACTORY);
    MutableDataset<Regressor> train = new MutableDataset<>(trainSource, REGRESSION_FACTORY);

    // Row i of trainValues is paired with the regression target trainTargets[i].
    double[] trainTargets = {
        5.0, 5.8, 8.0,
        10.0, 10.0, 10.0,
        20, 20, 20,
        50, 50, 50
    };
    double[][] trainValues = {
        { 1.0, 0.5, 1.0, negate * 1.0 },
        { 1.5, 0.35, 1.3, negate * 1.2 },
        { 1.2, 0.45, 1.5, negate * 1.0 },
        { negate * 1.1, 0.55, negate * 1.5, 0.5 },
        { negate * 1.5, 0.25, negate * 1, 0.125 },
        { negate * 1, 0.5, negate * 1.123, 0.123 },
        { 1.5, 5.0, 0.5, 4.5 },
        { 1.234, 5.1235, 0.1235, 6.0 },
        { 1.734, 4.5, 0.5123, 5.5 },
        { negate * 1, 0.25, 5, 10.0 },
        { negate * 1.4, 0.55, 5.65, 12.0 },
        { negate * 1.9, 0.25, 5.9, 15 }
    };
    for (int i = 0; i < trainTargets.length; i++) {
        train.add(new ArrayExample<>(new Regressor(SINGLE_DIM_NAME, trainTargets[i]), featureNames, trainValues[i]));
    }

    DataSourceProvenance testSource = new SimpleDataSourceProvenance("TestingData", OffsetDateTime.now(), REGRESSION_FACTORY);
    MutableDataset<Regressor> test = new MutableDataset<>(testSource, REGRESSION_FACTORY);

    double[] testTargets = { 5.1, 10.0, 20, 50 };
    double[][] testValues = {
        { 2.0, 0.45, 3.5, negate * 2.0 },
        { negate * 2.0, 0.55, negate * 2.5, 2.5 },
        { 1.75, 5.0, 1.0, 6.5 },
        { negate * 1.5, 0.25, 5.0, 20.0 }
    };
    for (int i = 0; i < testTargets.length; i++) {
        test.add(new ArrayExample<>(new Regressor(SINGLE_DIM_NAME, testTargets[i]), featureNames, testValues[i]));
    }

    return new Pair<>(train, test);
}
Also used : SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Regressor(org.tribuo.regression.Regressor) MutableDataset(org.tribuo.MutableDataset) DataSourceProvenance(org.tribuo.provenance.DataSourceProvenance) SimpleDataSourceProvenance(org.tribuo.provenance.SimpleDataSourceProvenance) Pair(com.oracle.labs.mlrg.olcut.util.Pair)

Aggregations

MutableDataset (org.tribuo.MutableDataset)7 DataSourceProvenance (org.tribuo.provenance.DataSourceProvenance)7 SimpleDataSourceProvenance (org.tribuo.provenance.SimpleDataSourceProvenance)7 Pair (com.oracle.labs.mlrg.olcut.util.Pair)5 Label (org.tribuo.classification.Label)3 Feature (org.tribuo.Feature)2 ArrayExample (org.tribuo.impl.ArrayExample)2 MultiLabel (org.tribuo.multilabel.MultiLabel)2 Regressor (org.tribuo.regression.Regressor)2