Use of org.apache.ignite.ml.dataset.feature.BucketMeta in project ignite by apache.
The class GiniFeatureHistogramTest, method testOfSums.
/**
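 * Tests that a histogram built over the whole dataset equals the sum of histograms built over
 * disjoint partitions of it, and that an empty histogram is the neutral element of {@code plus}.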
*/
@Test
public void testOfSums() {
    int sampleId = 0;
    BucketMeta bucketMeta1 = new BucketMeta(new FeatureMeta("", 0, false));
    bucketMeta1.setMinVal(0.);
    bucketMeta1.setBucketSize(0.1);
    BucketMeta bucketMeta2 = new BucketMeta(new FeatureMeta("", 1, true));
    GiniHistogram forAllHist1 = new GiniHistogram(sampleId, lblMapping, bucketMeta1);
    GiniHistogram forAllHist2 = new GiniHistogram(sampleId, lblMapping, bucketMeta2);
    List<GiniHistogram> partitions1 = new ArrayList<>();
    List<GiniHistogram> partitions2 = new ArrayList<>();
    // At least one partition is required, otherwise rnd.nextInt(cntOfPartitions) below would throw.
    int cntOfPartitions = rnd.nextInt(1000) + 1;
    for (int i = 0; i < cntOfPartitions; i++) {
        partitions1.add(new GiniHistogram(sampleId, lblMapping, bucketMeta1));
        partitions2.add(new GiniHistogram(sampleId, lblMapping, bucketMeta2));
    }
    int datasetSize = rnd.nextInt(10000);
    for (int i = 0; i < datasetSize; i++) {
        BootstrappedVector vec = randomVector(true);
        // Keep the second (categorical) feature's value within the [0, 100) range.
        vec.features().set(1, (vec.features().get(1) * 100) % 100);
        forAllHist1.addElement(vec);
        forAllHist2.addElement(vec);
        // Each vector also goes into exactly one randomly chosen partition histogram.
        int partId = rnd.nextInt(cntOfPartitions);
        partitions1.get(partId).addElement(vec);
        partitions2.get(partId).addElement(vec);
    }
    checkSums(forAllHist1, partitions1);
    checkSums(forAllHist2, partitions2);
    GiniHistogram emptyHist1 = new GiniHistogram(sampleId, lblMapping, bucketMeta1);
    GiniHistogram emptyHist2 = new GiniHistogram(sampleId, lblMapping, bucketMeta2);
    // An empty histogram must act as the neutral element of plus(), on both sides.
    assertTrue(forAllHist1.isEqualTo(forAllHist1.plus(emptyHist1)));
    assertTrue(forAllHist2.isEqualTo(forAllHist2.plus(emptyHist2)));
    assertTrue(forAllHist1.isEqualTo(emptyHist1.plus(forAllHist1)));
    assertTrue(forAllHist2.isEqualTo(emptyHist2.plus(forAllHist2)));
}
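checkSums(...) is not shown in this excerpt (the MSEHistogramTest below uses the same helper); conceptually it asserts that reducing the per-partition histograms with plus reproduces the histogram built over the whole dataset. A minimal, self-contained sketch of that invariant using plain Java maps; the class and helper names below are hypothetical, not the Ignite histogram types:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

class PartitionSumSketch {
    /** Counts how many elements fall into each bucket id. */
    static Map<Integer, Long> histogram(List<Integer> bucketIds) {
        Map<Integer, Long> hist = new HashMap<>();
        for (int bucket : bucketIds)
            hist.merge(bucket, 1L, Long::sum);
        return hist;
    }

    /** Bucket-wise sum of two histograms; an empty map is the neutral element. */
    static Map<Integer, Long> plus(Map<Integer, Long> a, Map<Integer, Long> b) {
        Map<Integer, Long> res = new HashMap<>(a);
        b.forEach((bucket, cnt) -> res.merge(bucket, cnt, Long::sum));
        return res;
    }

    public static void main(String[] args) {
        List<Integer> all = List.of(0, 1, 1, 2, 2, 2);
        List<Integer> part1 = List.of(0, 1, 2);   // disjoint split of 'all'
        List<Integer> part2 = List.of(1, 2, 2);
        // The invariant checkSums asserts for the Ignite histograms:
        System.out.println(histogram(all).equals(plus(histogram(part1), histogram(part2)))); // true
    }
}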
Use of org.apache.ignite.ml.dataset.feature.BucketMeta in project ignite by apache.
The class MSEHistogramTest, method testOfSums.
/**
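 * Tests that an MSE histogram built over the whole dataset equals the sum of histograms built over
 * disjoint partitions of it, and that an empty histogram is the neutral element of {@code plus}.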
*/
@Test
public void testOfSums() {
    int sampleId = 0;
    BucketMeta bucketMeta1 = new BucketMeta(new FeatureMeta("", 0, false));
    bucketMeta1.setMinVal(0.);
    bucketMeta1.setBucketSize(0.1);
    BucketMeta bucketMeta2 = new BucketMeta(new FeatureMeta("", 1, true));
    MSEHistogram forAllHist1 = new MSEHistogram(sampleId, bucketMeta1);
    MSEHistogram forAllHist2 = new MSEHistogram(sampleId, bucketMeta2);
    List<MSEHistogram> partitions1 = new ArrayList<>();
    List<MSEHistogram> partitions2 = new ArrayList<>();
    int cntOfPartitions = rnd.nextInt(100) + 1;
    for (int i = 0; i < cntOfPartitions; i++) {
        partitions1.add(new MSEHistogram(sampleId, bucketMeta1));
        partitions2.add(new MSEHistogram(sampleId, bucketMeta2));
    }
    int datasetSize = rnd.nextInt(1000) + 1;
    for (int i = 0; i < datasetSize; i++) {
        BootstrappedVector vec = randomVector(false);
        vec.features().set(1, (vec.features().get(1) * 100) % 100);
        forAllHist1.addElement(vec);
        forAllHist2.addElement(vec);
        int partId = rnd.nextInt(cntOfPartitions);
        partitions1.get(partId).addElement(vec);
        partitions2.get(partId).addElement(vec);
    }
    checkSums(forAllHist1, partitions1);
    checkSums(forAllHist2, partitions2);
    MSEHistogram emptyHist1 = new MSEHistogram(sampleId, bucketMeta1);
    MSEHistogram emptyHist2 = new MSEHistogram(sampleId, bucketMeta2);
    assertTrue(forAllHist1.isEqualTo(forAllHist1.plus(emptyHist1)));
    assertTrue(forAllHist2.isEqualTo(forAllHist2.plus(emptyHist2)));
    assertTrue(forAllHist1.isEqualTo(emptyHist1.plus(forAllHist1)));
    assertTrue(forAllHist2.isEqualTo(emptyHist2.plus(forAllHist2)));
}
Use of org.apache.ignite.ml.dataset.feature.BucketMeta in project ignite by apache.
The class RandomForestTrainer, method computeHistogramMeta.
/**
 * Computes bucket metas based on feature metas and the learning dataset.
 *
 * @param meta Features meta.
 * @param dataset Dataset.
 * @return Bucket metas.
 */
private Map<Integer, BucketMeta> computeHistogramMeta(List<FeatureMeta> meta,
    Dataset<EmptyContext, BootstrappedDatasetPartition> dataset) {

    List<NormalDistributionStatistics> stats =
        new NormalDistributionStatisticsComputer().computeStatistics(meta, dataset);
    if (stats == null)
        return Collections.emptyMap();
    Map<Integer, BucketMeta> bucketsMeta = new HashMap<>();
    for (int i = 0; i < stats.size(); i++) {
        BucketMeta bucketMeta = new BucketMeta(meta.get(i));
        // Min value and bucket size are only meaningful for continuous features;
        // the bucket width is proportional to the feature's standard deviation.
        if (!bucketMeta.getFeatureMeta().isCategoricalFeature()) {
            NormalDistributionStatistics stat = stats.get(i);
            bucketMeta.setMinVal(stat.min());
            bucketMeta.setBucketSize(stat.std() * BUCKET_SIZE_FACTOR);
        }
        bucketsMeta.put(i, bucketMeta);
    }
    return bucketsMeta;
}
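The produced metas are later used to map raw feature values to bucket ids; that mapping lives inside BucketMeta and is not shown in this excerpt. A plausible reading of the fields set above (the categorical flag, minVal and bucketSize) is the rule below, written as a hypothetical stand-alone helper rather than the real API:

/** Hypothetical illustration of the assumed bucketing rule; not the Ignite API itself. */
static int bucketIdFor(boolean categorical, double minVal, double bucketSize, double val) {
    if (categorical)
        return (int) val; // assumption: category ids serve as bucket ids directly
    // assumption: continuous features use a fixed-width grid starting at minVal
    return (int) Math.floor((val - minVal) / bucketSize);
}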
Use of org.apache.ignite.ml.dataset.feature.BucketMeta in project ignite by apache.
The class RandomForestTrainer, method fit.
/**
 * Trains a model based on the specified data.
 *
 * @param dataset Dataset.
 * @return List of decision trees.
 */
private List<RandomForestTreeModel> fit(Dataset<EmptyContext, BootstrappedDatasetPartition> dataset) {
    Queue<TreeNode> treesQueue = createRootsQueue();
    ArrayList<RandomForestTreeModel> roots = initTrees(treesQueue);
    Map<Integer, BucketMeta> histMeta = computeHistogramMeta(meta, dataset);
    if (histMeta.isEmpty())
        return Collections.emptyList();
    ImpurityHistogramsComputer<S> histogramsComputer = createImpurityHistogramsComputer();
    // Breadth-wise learning: each iteration takes a batch of frontier nodes (across all trees),
    // aggregates impurity histograms for them over the dataset, then splits them.
    while (!treesQueue.isEmpty()) {
        Map<NodeId, TreeNode> nodesToLearn = getNodesToLearn(treesQueue);
        Map<NodeId, ImpurityHistogramsComputer.NodeImpurityHistograms<S>> nodesImpHists =
            histogramsComputer.aggregateImpurityStatistics(roots, histMeta, nodesToLearn, dataset);
        if (nodesToLearn.size() != nodesImpHists.size())
            throw new IllegalStateException();
        for (NodeId nodeId : nodesImpHists.keySet())
            split(treesQueue, nodesToLearn, nodesImpHists.get(nodeId));
    }
    createLeafStatisticsAggregator().setValuesForLeaves(roots, dataset);
    return roots;
}
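Note that fit learns breadth-wise: each pass over the dataset serves a whole batch of frontier nodes across all trees at once. The histogram additivity verified by the testOfSums methods above is exactly what lets aggregateImpurityStatistics compute these statistics partition-by-partition and then reduce them, as the next snippet illustrates.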
Use of org.apache.ignite.ml.dataset.feature.BucketMeta in project ignite by apache.
The class ImpurityHistogramsComputer, method aggregateImpurityStatisticsOnPartition.
/**
 * Aggregates statistics for impurity computation for each corner (frontier) node of each tree in the
 * random forest. For every learning vector, the algorithm predicts the corner node in each decision tree
 * and adds the vector to the corresponding histograms.
 *
 * @param dataset Dataset.
 * @param roots Trees.
 * @param histMeta Histogram buckets meta.
 * @param part Partition.
 * @return Leaf statistics for impurity computation.
 */
private Map<NodeId, NodeImpurityHistograms<S>> aggregateImpurityStatisticsOnPartition(
    BootstrappedDatasetPartition dataset, ArrayList<RandomForestTreeModel> roots,
    Map<Integer, BucketMeta> histMeta, Map<NodeId, TreeNode> part) {

    Map<NodeId, NodeImpurityHistograms<S>> res =
        part.keySet().stream().collect(Collectors.toMap(n -> n, NodeImpurityHistograms::new));
    dataset.forEach(vector -> {
        for (int sampleId = 0; sampleId < vector.counters().length; sampleId++) {
            // A zero counter means the vector was not sampled into this tree's bootstrap.
            if (vector.counters()[sampleId] == 0)
                continue;
            RandomForestTreeModel root = roots.get(sampleId);
            NodeId key = root.getRootNode().predictNextNodeKey(vector.features());
            // Skip nodes that were not taken from the learning queue in this pass.
            if (!part.containsKey(key))
                continue;
            NodeImpurityHistograms<S> statistics = res.get(key);
            for (Integer featureId : root.getUsedFeatures()) {
                BucketMeta meta = histMeta.get(featureId);
                if (!statistics.perFeatureStatistics.containsKey(featureId))
                    statistics.perFeatureStatistics.put(featureId, createImpurityComputerForFeature(sampleId, meta));
                S impurityComputer = statistics.perFeatureStatistics.get(featureId);
                impurityComputer.addElement(vector);
            }
        }
    });
    return res;
}
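This method produces one result map per partition; those maps must then be reduced into a single map per node (aggregateImpurityStatistics, called from fit above, is the natural place for that, but the reduce step is not part of this excerpt). A minimal sketch of such a merge, assuming, and this is an assumption, that NodeImpurityHistograms exposes a plus(...) analogous to the histogram plus() exercised by the tests above:

/**
 * Sketch of the reduce step over per-partition results (hedged: assumes
 * NodeImpurityHistograms<S> can be combined bucket-wise with a plus(...)).
 */
private Map<NodeId, NodeImpurityHistograms<S>> reducePartitionResults(
    Map<NodeId, NodeImpurityHistograms<S>> left,
    Map<NodeId, NodeImpurityHistograms<S>> right) {

    Map<NodeId, NodeImpurityHistograms<S>> res = new HashMap<>(left);
    right.forEach((nodeId, hists) -> res.merge(nodeId, hists, NodeImpurityHistograms::plus));
    return res;
}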