use of org.apache.ignite.ml.dataset.Dataset in project ignite by apache.
the class GaussianNaiveBayesTrainer method updateModel.
/**
 * {@inheritDoc}
 *
 * <p>On every partition this accumulates, per label, the number of samples together with the per-feature
 * sums and sums of squares. The partial results are reduced into one holder, optionally merged with the
 * statistics stored in {@code mdl}, and turned into per-label means, variances and class probabilities.</p>
 *
 * @param mdl Model to update; when {@code null} (or when it carries no sums holder) only the new data is used.
 * @param datasetBuilder Dataset builder, must not be {@code null}.
 * @param extractor Preprocessor that converts an upstream entry into a labeled vector.
 * @return Model built from the accumulated statistics.
 * @throws RuntimeException Wrapping any exception raised while building, processing or closing the dataset.
 */
@Override
protected <K, V> GaussianNaiveBayesModel updateModel(GaussianNaiveBayesModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> extractor) {
    assert datasetBuilder != null;

    try (Dataset<EmptyContext, GaussianNaiveBayesSumsHolder> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        (env, upstream, upstreamSize, ctx) -> {
            GaussianNaiveBayesSumsHolder res = new GaussianNaiveBayesSumsHolder();

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector lv = extractor.apply(entity.getKey(), entity.getValue());
                Vector features = lv.features();
                Double label = (Double) lv.label();

                // Freshly allocated arrays are already zero-filled, so no explicit fill is needed.
                if (!res.featureSumsPerLbl.containsKey(label))
                    res.featureSumsPerLbl.put(label, new double[features.size()]);

                if (!res.featureSquaredSumsPerLbl.containsKey(label))
                    res.featureSquaredSumsPerLbl.put(label, new double[features.size()]);

                if (!res.featureCountersPerLbl.containsKey(label))
                    res.featureCountersPerLbl.put(label, 0);

                res.featureCountersPerLbl.put(label, res.featureCountersPerLbl.get(label) + 1);

                double[] toMeans = res.featureSumsPerLbl.get(label);
                double[] sqSum = res.featureSquaredSumsPerLbl.get(label);

                for (int j = 0; j < features.size(); j++) {
                    double x = features.get(j);

                    toMeans[j] += x;
                    sqSum[j] += x * x;
                }
            }

            return res;
        },
        learningEnvironment())) {
        // Reduce partial holders; a null operand means "nothing to merge from that side".
        GaussianNaiveBayesSumsHolder sumsHolder = dataset.compute(t -> t, (a, b) -> {
            if (a == null)
                return b;

            if (b == null)
                return a;

            return a.merge(b);
        });

        // Fail with an explicit message instead of an anonymous NullPointerException further down.
        if (sumsHolder == null)
            throw new IllegalStateException("No statistics were computed: the dataset produced no partition data");

        if (mdl != null && mdl.getSumsHolder() != null)
            sumsHolder = sumsHolder.merge(mdl.getSumsHolder());

        List<Double> sortedLabels = new ArrayList<>(sumsHolder.featureCountersPerLbl.keySet());
        sortedLabels.sort(Double::compareTo);

        assert !sortedLabels.isEmpty() : "The dataset should contain at least one feature";

        int labelCount = sortedLabels.size();
        int featureCount = sumsHolder.featureSumsPerLbl.get(sortedLabels.get(0)).length;

        double[][] means = new double[labelCount][featureCount];
        double[][] variances = new double[labelCount][featureCount];
        double[] classProbabilities = new double[labelCount];
        double[] labels = new double[labelCount];

        // Sum as long: adding the per-label int counters in int arithmetic could overflow before widening.
        long datasetSize = sumsHolder.featureCountersPerLbl.values().stream().mapToLong(Integer::longValue).sum();

        int lbl = 0;

        for (Double label : sortedLabels) {
            int count = sumsHolder.featureCountersPerLbl.get(label);
            double[] sum = sumsHolder.featureSumsPerLbl.get(label);
            double[] sqSum = sumsHolder.featureSquaredSumsPerLbl.get(label);

            for (int i = 0; i < featureCount; i++) {
                means[lbl][i] = sum[i] / count;
                // Variance from running sums: (sum of squares - (sum)^2 / n) / n.
                variances[lbl][i] = (sqSum[i] - sum[i] * sum[i] / count) / count;
            }

            if (equiprobableClasses)
                classProbabilities[lbl] = 1. / labelCount;
            else if (priorProbabilities != null) {
                assert classProbabilities.length == priorProbabilities.length;

                classProbabilities[lbl] = priorProbabilities[lbl];
            }
            else
                classProbabilities[lbl] = (double) count / datasetSize;

            labels[lbl] = label;
            ++lbl;
        }

        return new GaussianNaiveBayesModel(means, variances, classProbabilities, labels, sumsHolder);
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
}
use of org.apache.ignite.ml.dataset.Dataset in project ignite by apache.
the class DiscreteNaiveBayesTrainer method updateModel.
/**
 * {@inheritDoc}
 *
 * <p>On every partition this counts, per label and per feature, how many values fall into each bucket
 * delimited by {@code bucketThresholds}, along with the number of samples per label. The partial counts
 * are reduced into one holder, optionally merged with the statistics stored in {@code mdl}, and converted
 * into per-bucket conditional probabilities and class probabilities.</p>
 *
 * @param mdl Model to update; when {@code null} only the new data is used.
 * @param datasetBuilder Dataset builder.
 * @param extractor Preprocessor that converts an upstream entry into a labeled vector.
 * @return Model built from the accumulated statistics.
 * @throws RuntimeException Wrapping any exception raised while building, processing or closing the dataset.
 */
@Override
protected <K, V> DiscreteNaiveBayesModel updateModel(DiscreteNaiveBayesModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> extractor) {
    try (Dataset<EmptyContext, DiscreteNaiveBayesSumsHolder> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        (env, upstream, upstreamSize, ctx) -> {
            DiscreteNaiveBayesSumsHolder res = new DiscreteNaiveBayesSumsHolder();

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector lv = extractor.apply(entity.getKey(), entity.getValue());
                Vector features = lv.features();
                Double lb = (Double) lv.label();
                int size = features.size();

                if (!res.valuesInBucketPerLbl.containsKey(lb)) {
                    long[][] counters = new long[size][];

                    // One more bucket than thresholds; new arrays are zero-filled already.
                    for (int i = 0; i < size; i++)
                        counters[i] = new long[bucketThresholds[i].length + 1];

                    res.valuesInBucketPerLbl.put(lb, counters);
                }

                if (!res.featureCountersPerLbl.containsKey(lb))
                    res.featureCountersPerLbl.put(lb, 0);

                res.featureCountersPerLbl.put(lb, res.featureCountersPerLbl.get(lb) + 1);

                long[][] valuesInBucket = res.valuesInBucketPerLbl.get(lb);

                for (int j = 0; j < size; j++) {
                    double x = features.get(j);
                    int bucketNum = toBucketNumber(x, bucketThresholds[j]);

                    valuesInBucket[j][bucketNum] += 1;
                }
            }

            return res;
        },
        learningEnvironment())) {
        // Reduce partial holders; a null operand means "nothing to merge from that side".
        DiscreteNaiveBayesSumsHolder sumsHolder = dataset.compute(t -> t, (a, b) -> {
            if (a == null)
                return b;

            if (b == null)
                return a;

            return a.merge(b);
        });

        // Fail with an explicit message instead of an anonymous NullPointerException further down.
        if (sumsHolder == null)
            throw new IllegalStateException("No statistics were computed: the dataset produced no partition data");

        if (mdl != null && isUpdateable(mdl)) {
            if (checkSumsHolder(sumsHolder, mdl.getSumsHolder()))
                sumsHolder = sumsHolder.merge(mdl.getSumsHolder());
        }

        List<Double> sortedLabels = new ArrayList<>(sumsHolder.featureCountersPerLbl.keySet());
        sortedLabels.sort(Double::compareTo);

        assert !sortedLabels.isEmpty() : "The dataset should contain at least one feature";

        int lbCnt = sortedLabels.size();
        int featureCnt = sumsHolder.valuesInBucketPerLbl.get(sortedLabels.get(0)).length;

        double[][][] probabilities = new double[lbCnt][featureCnt][];
        double[] classProbabilities = new double[lbCnt];
        double[] labels = new double[lbCnt];

        // Sum as long: adding the per-label int counters in int arithmetic could overflow before widening.
        long datasetSize = sumsHolder.featureCountersPerLbl.values().stream().mapToLong(Integer::longValue).sum();

        int lbl = 0;

        for (Double label : sortedLabels) {
            int cnt = sumsHolder.featureCountersPerLbl.get(label);
            long[][] sum = sumsHolder.valuesInBucketPerLbl.get(label);

            for (int i = 0; i < featureCnt; i++) {
                int bucketsCnt = sum[i].length;

                probabilities[lbl][i] = new double[bucketsCnt];

                // Share of this label's samples whose i-th feature landed in bucket j.
                for (int j = 0; j < bucketsCnt; j++)
                    probabilities[lbl][i][j] = (double) sum[i][j] / cnt;
            }

            if (equiprobableClasses)
                classProbabilities[lbl] = 1. / lbCnt;
            else if (priorProbabilities != null) {
                assert classProbabilities.length == priorProbabilities.length;

                classProbabilities[lbl] = priorProbabilities[lbl];
            }
            else
                classProbabilities[lbl] = (double) cnt / datasetSize;

            labels[lbl] = label;
            ++lbl;
        }

        return new DiscreteNaiveBayesModel(probabilities, classProbabilities, labels, bucketThresholds, sumsHolder);
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
}
use of org.apache.ignite.ml.dataset.Dataset in project ignite by apache.
the class MinMaxScalerTrainer method fit.
/**
 * {@inheritDoc}
 *
 * <p>Computes, on every partition, the per-feature minimum and maximum of the rows produced by
 * {@code basePreprocessor}, reduces them element-wise across partitions and returns a preprocessor
 * configured with the resulting bounds.</p>
 *
 * @param envBuilder Learning environment builder passed to the dataset builder.
 * @param datasetBuilder Dataset builder.
 * @param basePreprocessor Preprocessor whose output rows are scanned for minima and maxima.
 * @return Min-max scaler preprocessor wrapping {@code basePreprocessor}.
 * @throws RuntimeException Wrapping any exception raised while building, processing or closing the
 *      dataset, including the case where no partition contained a single row.
 */
@Override
public MinMaxScalerPreprocessor<K, V> fit(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> basePreprocessor) {
    PartitionContextBuilder<K, V, EmptyContext> ctxBuilder = (env, upstream, upstreamSize) -> new EmptyContext();

    try (Dataset<EmptyContext, MinMaxScalerPartitionData> dataset = datasetBuilder.build(
        envBuilder,
        ctxBuilder,
        (env, upstream, upstreamSize, ctx) -> {
            // Both stay null when the partition has no rows.
            double[] min = null;
            double[] max = null;

            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector row = basePreprocessor.apply(entity.getKey(), entity.getValue());

                if (min == null) {
                    min = new double[row.size()];
                    Arrays.fill(min, Double.MAX_VALUE);
                }
                else
                    assert min.length == row.size() : "Base preprocessor must return exactly " + min.length + " features";

                if (max == null) {
                    max = new double[row.size()];
                    Arrays.fill(max, -Double.MAX_VALUE);
                }
                else
                    assert max.length == row.size() : "Base preprocessor must return exactly " + max.length + " features";

                for (int i = 0; i < row.size(); i++) {
                    // Read the value once instead of up to four times per feature.
                    double val = row.get(i);

                    if (val < min[i])
                        min[i] = val;

                    if (val > max[i])
                        max[i] = val;
                }
            }

            return new MinMaxScalerPartitionData(min, max);
        },
        learningEnvironment(basePreprocessor))) {
        // Partitions without rows map to null and are skipped by the reducer.
        double[][] minMax = dataset.compute(
            data -> data.getMin() != null ? new double[][] {data.getMin(), data.getMax()} : null,
            (a, b) -> {
                if (a == null)
                    return b;

                if (b == null)
                    return a;

                double[][] res = new double[2][];

                res[0] = new double[a[0].length];
                for (int i = 0; i < res[0].length; i++)
                    res[0][i] = Math.min(a[0][i], b[0][i]);

                res[1] = new double[a[1].length];
                for (int i = 0; i < res[1].length; i++)
                    res[1][i] = Math.max(a[1][i], b[1][i]);

                return res;
            });

        // Without this guard an empty dataset fails below with an uninformative NullPointerException.
        if (minMax == null)
            throw new IllegalStateException("Cannot fit min-max scaler: no partition produced min/max statistics (is the dataset empty?)");

        return new MinMaxScalerPreprocessor<>(minMax[0], minMax[1], basePreprocessor);
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }
}
use of org.apache.ignite.ml.dataset.Dataset in project ignite by apache.
the class AlgorithmSpecificDatasetExample method main.
/**
 * Run example.
 *
 * <p>Starts an Ignite node from {@code examples/config/example-ignite.xml}, fills a cache through
 * {@code createCache}, wraps it into an {@code AlgorithmSpecificDataset}, runs up to 1000 gradient descent
 * steps for a two-parameter linear regression model, prints the result and destroys the cache.</p>
 *
 * @param args Command line arguments; not used.
 * @throws Exception If the example fails.
 */
public static void main(String[] args) throws Exception {
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        System.out.println(">>> Algorithm Specific Dataset example started.");

        IgniteCache<Integer, Vector> persons = null;

        try {
            persons = createCache(ignite);

            Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<>(1);

            IgniteFunction<LabeledVector<Double>, LabeledVector<double[]>> func =
                lv -> new LabeledVector<>(lv.features(), new double[] {lv.label()});

            // NOTE: This class is part of Developer API and all lambdas should be loaded on server manually.
            Preprocessor<Integer, Vector> preprocessor = new PatchedPreprocessor<>(func, vectorizer);

            // Creates an algorithm specific dataset to perform linear regression. Here we define the way features and
            // labels are extracted, and partition data and context are created.
            SimpleLabeledDatasetDataBuilder<Integer, Vector, AlgorithmSpecificPartitionContext> builder =
                new SimpleLabeledDatasetDataBuilder<>(preprocessor);

            IgniteBiFunction<SimpleLabeledDatasetData, AlgorithmSpecificPartitionContext, SimpleLabeledDatasetData> builderFun = (data, ctx) -> {
                double[] features = data.getFeatures();
                int rows = data.getRows();

                // Makes a copy of features to supplement it by columns with values equal to 1.0:
                // the first `rows` entries keep the 1.0 fill, the original features follow them.
                double[] a = new double[features.length + rows];
                Arrays.fill(a, 1.0);
                System.arraycopy(features, 0, a, rows, features.length);

                return new SimpleLabeledDatasetData(a, data.getLabels(), rows);
            };

            try (AlgorithmSpecificDataset dataset = DatasetFactory.create(
                ignite,
                persons,
                (env, upstream, upstreamSize) -> new AlgorithmSpecificPartitionContext(),
                builder.andThen(builderFun)
            ).wrap(AlgorithmSpecificDataset::new)) {
                // Trains linear regression model using gradient descent.
                double[] linearRegressionMdl = new double[2];

                // The step size depends only on the cache size, so query it once rather than on every update.
                double step = 0.1 / persons.size();

                for (int i = 0; i < 1000; i++) {
                    double[] gradient = dataset.gradient(linearRegressionMdl);

                    // Stop as soon as the Euclidean norm of the gradient is small enough.
                    if (BLAS.getInstance().dnrm2(gradient.length, gradient, 1) < 1e-4)
                        break;

                    for (int j = 0; j < gradient.length; j++)
                        linearRegressionMdl[j] -= step * gradient[j];
                }

                System.out.println("Linear Regression Model: " + Arrays.toString(linearRegressionMdl));
            }

            System.out.println(">>> Algorithm Specific Dataset example completed.");
        }
        finally {
            // The cache is still null if createCache failed; guarding here keeps the original exception
            // from being replaced by a NullPointerException.
            if (persons != null)
                persons.destroy();
        }
    }
    finally {
        System.out.flush();
    }
}
use of org.apache.ignite.ml.dataset.Dataset in project ignite by apache.
the class LearningEnvironmentTest method testRandomNumbersGenerator.
/**
 * Test random number generator provided by {@link LearningEnvironment}.
 * We test that:
 * 1. Correct random generator is returned for each partition.
 * 2. Its state is saved between compute calls (for this we do several iterations of compute).
 */
@Test
public void testRandomNumbersGenerator() {
    // The environment builder supplies MockRandom generators; the expectation at the end of this test
    // relies on nextInt() yielding (partition index * iteration) on each partition.
    LearningEnvironmentBuilder envBuilder = TestUtils.testEnvBuilder().withRandomDependency(MockRandom::new);

    int partitions = 10;
    int iterations = 2;

    DatasetTrainer<IgniteModel<Object, Vector>, Void> trainer = new DatasetTrainer<IgniteModel<Object, Vector>, Void>() {
        /**
         * {@inheritDoc}
         */
        @Override
        public <K, V> IgniteModel<Object, Vector> fitWithInitializedDeployingContext(DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
            // Close the dataset once the computations are done instead of leaking it.
            try (Dataset<EmptyContext, TestUtils.DataWrapper<Integer>> ds = datasetBuilder.build(
                envBuilder,
                new EmptyContextBuilder<>(),
                (PartitionDataBuilder<K, V, EmptyContext, TestUtils.DataWrapper<Integer>>) (env, upstreamData, upstreamDataSize, ctx) -> TestUtils.DataWrapper.of(env.partition()),
                envBuilder.buildForTrainer())) {
                Vector v = null;

                // Every pass stores the partition's next random number at the partition's own index;
                // only the vector from the last pass is kept.
                for (int iter = 0; iter < iterations; iter++) {
                    v = ds.compute(
                        (dw, env) -> VectorUtils.fill(-1, partitions).set(env.partition(), env.randomNumbersGenerator().nextInt()),
                        (v1, v2) -> zipOverridingEmpty(v1, v2, -1));
                }

                return constantModel(v);
            }
            catch (Exception e) {
                throw new RuntimeException(e);
            }
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public boolean isUpdateable(IgniteModel<Object, Vector> mdl) {
            return false;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        protected <K, V> IgniteModel<Object, Vector> updateModel(IgniteModel<Object, Vector> mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
            return null;
        }
    };

    trainer.withEnvironmentBuilder(envBuilder);

    IgniteModel<Object, Vector> mdl = trainer.fit(getCacheMock(partitions), partitions, null);

    // After the last iteration partition i is expected to have produced i * iterations.
    Vector exp = VectorUtils.zeroes(partitions);

    for (int i = 0; i < partitions; i++)
        exp.set(i, i * iterations);

    Vector res = mdl.predict(null);

    assertEquals(exp, res);
}
Aggregations