Search in sources:

Example 1 with BucketMeta

Use of org.apache.ignite.ml.dataset.feature.BucketMeta in the Apache Ignite project.

Class GiniFeatureHistogramTest, method testOfSums.

/**
 * Builds Gini histograms for feature 0 (explicit min value and bucket size) and feature 1
 * (default bucket meta) over a random data set, both as a single histogram and spread over a
 * random number of partition histograms, and hands both views to {@code checkSums}.
 * Additionally asserts that adding an empty histogram on either side leaves a histogram unchanged.
 */
@Test
public void testOfSums() {
    int sampleId = 0;
    BucketMeta bucketMeta1 = new BucketMeta(new FeatureMeta("", 0, false));
    bucketMeta1.setMinVal(0.);
    bucketMeta1.setBucketSize(0.1);
    BucketMeta bucketMeta2 = new BucketMeta(new FeatureMeta("", 1, true));
    GiniHistogram forAllHist1 = new GiniHistogram(sampleId, lblMapping, bucketMeta1);
    GiniHistogram forAllHist2 = new GiniHistogram(sampleId, lblMapping, bucketMeta2);
    List<GiniHistogram> partitions1 = new ArrayList<>();
    List<GiniHistogram> partitions2 = new ArrayList<>();
    // At least one partition is required: the partition index below is drawn with
    // rnd.nextInt(cntOfPartitions), which is invalid for a zero bound (assuming rnd is a
    // java.util.Random), and there would be no partition histogram to add the vector to.
    int cntOfPartitions = rnd.nextInt(1000) + 1;
    for (int i = 0; i < cntOfPartitions; i++) {
        partitions1.add(new GiniHistogram(sampleId, lblMapping, bucketMeta1));
        partitions2.add(new GiniHistogram(sampleId, lblMapping, bucketMeta2));
    }
    // An empty data set (size 0) is allowed: the loop is simply skipped.
    int datasetSize = rnd.nextInt(10000);
    for (int i = 0; i < datasetSize; i++) {
        BootstrappedVector vec = randomVector(true);
        // Rescale feature 1 before it is added to the histograms.
        vec.features().set(1, (vec.features().get(1) * 100) % 100);
        forAllHist1.addElement(vec);
        forAllHist2.addElement(vec);
        // The same vector goes to exactly one randomly chosen partition per feature.
        int partId = rnd.nextInt(cntOfPartitions);
        partitions1.get(partId).addElement(vec);
        partitions2.get(partId).addElement(vec);
    }
    checkSums(forAllHist1, partitions1);
    checkSums(forAllHist2, partitions2);
    // An empty histogram must not change the other operand, whichever side it is on.
    GiniHistogram emptyHist1 = new GiniHistogram(sampleId, lblMapping, bucketMeta1);
    GiniHistogram emptyHist2 = new GiniHistogram(sampleId, lblMapping, bucketMeta2);
    assertTrue(forAllHist1.isEqualTo(forAllHist1.plus(emptyHist1)));
    assertTrue(forAllHist2.isEqualTo(forAllHist2.plus(emptyHist2)));
    assertTrue(forAllHist1.isEqualTo(emptyHist1.plus(forAllHist1)));
    assertTrue(forAllHist2.isEqualTo(emptyHist2.plus(forAllHist2)));
}
Also used : FeatureMeta(org.apache.ignite.ml.dataset.feature.FeatureMeta) ArrayList(java.util.ArrayList) BucketMeta(org.apache.ignite.ml.dataset.feature.BucketMeta) BootstrappedVector(org.apache.ignite.ml.dataset.impl.bootstrapping.BootstrappedVector) Test(org.junit.Test)

Example 2 with BucketMeta

Use of org.apache.ignite.ml.dataset.feature.BucketMeta in the Apache Ignite project.

Class MSEHistogramTest, method testOfSums.

/**
 * Feeds random vectors into MSE histograms for feature 0 (explicit min value and bucket size)
 * and feature 1 (default bucket meta), once into a single histogram and once into a randomly
 * chosen partition histogram, then passes both views to {@code checkSums}. Finally asserts that
 * adding an empty histogram on either side leaves a histogram unchanged.
 */
@Test
public void testOfSums() {
    final int sample = 0;

    BucketMeta metaForFeature0 = new BucketMeta(new FeatureMeta("", 0, false));
    metaForFeature0.setMinVal(0.0);
    metaForFeature0.setBucketSize(0.1);
    BucketMeta metaForFeature1 = new BucketMeta(new FeatureMeta("", 1, true));

    MSEHistogram totalHist0 = new MSEHistogram(sample, metaForFeature0);
    MSEHistogram totalHist1 = new MSEHistogram(sample, metaForFeature1);

    List<MSEHistogram> parts0 = new ArrayList<>();
    List<MSEHistogram> parts1 = new ArrayList<>();

    // Always at least one partition, so a partition index can be drawn below.
    final int partsCnt = rnd.nextInt(100) + 1;
    for (int left = partsCnt; left > 0; left--) {
        parts0.add(new MSEHistogram(sample, metaForFeature0));
        parts1.add(new MSEHistogram(sample, metaForFeature1));
    }

    int vectorsLeft = rnd.nextInt(1000) + 1;
    while (vectorsLeft > 0) {
        BootstrappedVector sampleVec = randomVector(false);
        // Rescale feature 1 before it is added to the histograms.
        sampleVec.features().set(1, (sampleVec.features().get(1) * 100) % 100);

        totalHist0.addElement(sampleVec);
        totalHist1.addElement(sampleVec);

        // The same vector goes to exactly one randomly chosen partition per feature.
        int target = rnd.nextInt(partsCnt);
        parts0.get(target).addElement(sampleVec);
        parts1.get(target).addElement(sampleVec);

        vectorsLeft--;
    }

    checkSums(totalHist0, parts0);
    checkSums(totalHist1, parts1);

    // An empty histogram must not change the other operand, whichever side it is on.
    MSEHistogram blank0 = new MSEHistogram(sample, metaForFeature0);
    MSEHistogram blank1 = new MSEHistogram(sample, metaForFeature1);

    assertTrue(totalHist0.isEqualTo(totalHist0.plus(blank0)));
    assertTrue(totalHist1.isEqualTo(totalHist1.plus(blank1)));
    assertTrue(totalHist0.isEqualTo(blank0.plus(totalHist0)));
    assertTrue(totalHist1.isEqualTo(blank1.plus(totalHist1)));
}
Also used : FeatureMeta(org.apache.ignite.ml.dataset.feature.FeatureMeta) ArrayList(java.util.ArrayList) BucketMeta(org.apache.ignite.ml.dataset.feature.BucketMeta) BootstrappedVector(org.apache.ignite.ml.dataset.impl.bootstrapping.BootstrappedVector) Test(org.junit.Test)

Example 3 with BucketMeta

Use of org.apache.ignite.ml.dataset.feature.BucketMeta in the Apache Ignite project.

Class RandomForestTrainer, method computeHistogramMeta.

/**
 * Creates one {@link BucketMeta} per computed feature statistic, keyed by the feature's position
 * in the statistics list. For features not flagged as categorical, the bucket's minimum value is
 * taken from the statistic's min and its size from the standard deviation scaled by
 * {@code BUCKET_SIZE_FACTOR}; categorical features keep the default bucket settings.
 *
 * @param meta    Features meta.
 * @param dataset Dataset the statistics are computed on.
 * @return Bucket metas by feature index, or an empty map when no statistics could be computed.
 */
private Map<Integer, BucketMeta> computeHistogramMeta(List<FeatureMeta> meta, Dataset<EmptyContext, BootstrappedDatasetPartition> dataset) {
    NormalDistributionStatisticsComputer statsComputer = new NormalDistributionStatisticsComputer();
    List<NormalDistributionStatistics> stats = statsComputer.computeStatistics(meta, dataset);

    // No statistics available: nothing to build buckets from.
    if (stats == null) {
        return Collections.emptyMap();
    }

    Map<Integer, BucketMeta> res = new HashMap<>();
    int featureIdx = 0;
    while (featureIdx < stats.size()) {
        BucketMeta bucket = new BucketMeta(meta.get(featureIdx));
        boolean categorical = bucket.getFeatureMeta().isCategoricalFeature();

        // Only non-categorical features get a value range split into buckets.
        if (!categorical) {
            NormalDistributionStatistics featureStat = stats.get(featureIdx);
            bucket.setMinVal(featureStat.min());
            bucket.setBucketSize(featureStat.std() * BUCKET_SIZE_FACTOR);
        }

        res.put(featureIdx, bucket);
        featureIdx++;
    }

    return res;
}
Also used : HashMap(java.util.HashMap) NormalDistributionStatisticsComputer(org.apache.ignite.ml.tree.randomforest.data.statistics.NormalDistributionStatisticsComputer) NormalDistributionStatistics(org.apache.ignite.ml.tree.randomforest.data.statistics.NormalDistributionStatistics) BucketMeta(org.apache.ignite.ml.dataset.feature.BucketMeta)

Example 4 with BucketMeta

Use of org.apache.ignite.ml.dataset.feature.BucketMeta in the Apache Ignite project.

Class RandomForestTrainer, method fit.

/**
 * Trains model based on the specified data.
 *
 * Root nodes are queued first; then, while the queue is not empty, a batch of nodes is taken
 * from it, impurity histograms are aggregated for that batch and every node with histograms is
 * passed to {@code split}. Once the queue is drained, leaf values are set from the dataset.
 *
 * @param dataset Dataset.
 * @return list of decision trees, or an empty list when no histogram meta could be computed.
 * @throws IllegalStateException if the number of aggregated histogram sets differs from the
 *      number of nodes taken for learning.
 */
private List<RandomForestTreeModel> fit(Dataset<EmptyContext, BootstrappedDatasetPartition> dataset) {
    Queue<TreeNode> treesQueue = createRootsQueue();
    ArrayList<RandomForestTreeModel> roots = initTrees(treesQueue);
    Map<Integer, BucketMeta> histMeta = computeHistogramMeta(meta, dataset);
    // Without bucket metas no histograms can be built, so there is nothing to train.
    if (histMeta.isEmpty())
        return Collections.emptyList();
    ImpurityHistogramsComputer<S> histogramsComputer = createImpurityHistogramsComputer();
    while (!treesQueue.isEmpty()) {
        Map<NodeId, TreeNode> nodesToLearn = getNodesToLearn(treesQueue);
        Map<NodeId, ImpurityHistogramsComputer.NodeImpurityHistograms<S>> nodesImpHists = histogramsComputer.aggregateImpurityStatistics(roots, histMeta, nodesToLearn, dataset);
        // Every node taken for learning must have received its histograms.
        if (nodesToLearn.size() != nodesImpHists.size()) {
            throw new IllegalStateException("Impurity histograms were aggregated for " + nodesImpHists.size()
                + " node(s), but " + nodesToLearn.size() + " node(s) were taken for learning");
        }
        // Iterate the values directly instead of looking each key up again.
        for (ImpurityHistogramsComputer.NodeImpurityHistograms<S> nodeImpHists : nodesImpHists.values())
            split(treesQueue, nodesToLearn, nodeImpHists);
    }
    createLeafStatisticsAggregator().setValuesForLeaves(roots, dataset);
    return roots;
}
Also used : BucketMeta(org.apache.ignite.ml.dataset.feature.BucketMeta) TreeNode(org.apache.ignite.ml.tree.randomforest.data.TreeNode) RandomForestTreeModel(org.apache.ignite.ml.tree.randomforest.data.RandomForestTreeModel) NodeId(org.apache.ignite.ml.tree.randomforest.data.NodeId)

Example 5 with BucketMeta

Use of org.apache.ignite.ml.dataset.feature.BucketMeta in the Apache Ignite project.

Class ImpurityHistogramsComputer, method aggregateImpurityStatisticsOnPartition.

/**
 * Aggregates, over one partition, the statistics needed for impurity computing for the nodes
 * that are currently being learned. For every vector and every sample (tree) in which the vector
 * has a non-zero counter, the tree predicts the node the vector falls into; if that node is among
 * the nodes to learn, the vector is added to the node's histogram of each feature used by the tree.
 *
 * @param dataset Partition data to iterate over.
 * @param roots Trees, addressed by sample id.
 * @param histMeta Histogram buckets meta by feature id.
 * @param part Nodes to learn by node id.
 * @return Impurity histograms by node id; one entry per node in {@code part}.
 */
private Map<NodeId, NodeImpurityHistograms<S>> aggregateImpurityStatisticsOnPartition(BootstrappedDatasetPartition dataset, ArrayList<RandomForestTreeModel> roots, Map<Integer, BucketMeta> histMeta, Map<NodeId, TreeNode> part) {
    // Start with an empty histogram holder for every node to learn.
    Map<NodeId, NodeImpurityHistograms<S>> histsByNode = part.keySet()
        .stream()
        .collect(Collectors.toMap(id -> id, NodeImpurityHistograms::new));

    dataset.forEach(vec -> {
        for (int treeIdx = 0; treeIdx < vec.counters().length; treeIdx++) {
            // A zero counter means the vector takes no part in this sample.
            if (vec.counters()[treeIdx] != 0) {
                RandomForestTreeModel tree = roots.get(treeIdx);
                NodeId nodeId = tree.getRootNode().predictNextNodeKey(vec.features());

                // The predicted node may not be in the current batch if not all nodes
                // were taken from the learning queue.
                if (part.containsKey(nodeId)) {
                    NodeImpurityHistograms<S> nodeHists = histsByNode.get(nodeId);

                    for (Integer featureId : tree.getUsedFeatures()) {
                        BucketMeta bucketMeta = histMeta.get(featureId);

                        // Create the per-feature impurity computer on first use.
                        if (!nodeHists.perFeatureStatistics.containsKey(featureId)) {
                            nodeHists.perFeatureStatistics.put(featureId, createImpurityComputerForFeature(treeIdx, bucketMeta));
                        }

                        nodeHists.perFeatureStatistics.get(featureId).addElement(vec);
                    }
                }
            }
        }
    });

    return histsByNode;
}
Also used : TreeNode(org.apache.ignite.ml.tree.randomforest.data.TreeNode) BootstrappedVector(org.apache.ignite.ml.dataset.impl.bootstrapping.BootstrappedVector) NodeSplit(org.apache.ignite.ml.tree.randomforest.data.NodeSplit) HashMap(java.util.HashMap) BootstrappedDatasetPartition(org.apache.ignite.ml.dataset.impl.bootstrapping.BootstrappedDatasetPartition) Collectors(java.util.stream.Collectors) NodeId(org.apache.ignite.ml.tree.randomforest.data.NodeId) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) BucketMeta(org.apache.ignite.ml.dataset.feature.BucketMeta) Stream(java.util.stream.Stream) Dataset(org.apache.ignite.ml.dataset.Dataset) Map(java.util.Map) Optional(java.util.Optional) RandomForestTreeModel(org.apache.ignite.ml.tree.randomforest.data.RandomForestTreeModel) Comparator(java.util.Comparator) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) RandomForestTreeModel(org.apache.ignite.ml.tree.randomforest.data.RandomForestTreeModel) NodeId(org.apache.ignite.ml.tree.randomforest.data.NodeId) BucketMeta(org.apache.ignite.ml.dataset.feature.BucketMeta)

Aggregations

BucketMeta (org.apache.ignite.ml.dataset.feature.BucketMeta)5 ArrayList (java.util.ArrayList)3 BootstrappedVector (org.apache.ignite.ml.dataset.impl.bootstrapping.BootstrappedVector)3 HashMap (java.util.HashMap)2 FeatureMeta (org.apache.ignite.ml.dataset.feature.FeatureMeta)2 NodeId (org.apache.ignite.ml.tree.randomforest.data.NodeId)2 RandomForestTreeModel (org.apache.ignite.ml.tree.randomforest.data.RandomForestTreeModel)2 TreeNode (org.apache.ignite.ml.tree.randomforest.data.TreeNode)2 Test (org.junit.Test)2 Serializable (java.io.Serializable)1 Comparator (java.util.Comparator)1 Map (java.util.Map)1 Optional (java.util.Optional)1 Collectors (java.util.stream.Collectors)1 Stream (java.util.stream.Stream)1 Dataset (org.apache.ignite.ml.dataset.Dataset)1 BootstrappedDatasetPartition (org.apache.ignite.ml.dataset.impl.bootstrapping.BootstrappedDatasetPartition)1 EmptyContext (org.apache.ignite.ml.dataset.primitive.context.EmptyContext)1 NodeSplit (org.apache.ignite.ml.tree.randomforest.data.NodeSplit)1 NormalDistributionStatistics (org.apache.ignite.ml.tree.randomforest.data.statistics.NormalDistributionStatistics)1