Search in sources:

Example 1 with UpstreamEntry

Use of org.apache.ignite.ml.dataset.UpstreamEntry in the Apache Ignite project.

The class EncoderTrainer, method fit.

/**
 * {@inheritDoc}
 *
 * <p>Builds a dataset whose per-partition data depends on the configured encoder type: label frequencies
 * for {@code LABEL_ENCODER}, target counters for {@code TARGET_ENCODER} and per-feature category
 * frequencies for every other type. The preprocessor to return is then selected by encoder type.
 *
 * @param envBuilder Learning environment builder passed to the dataset builder.
 * @param datasetBuilder Builder used to construct the dataset.
 * @param basePreprocessor Preprocessor applied to every upstream entry to obtain the labeled row.
 * @return Encoder preprocessor that corresponds to the configured encoder type.
 * @throws RuntimeException If no handled indices were configured for a non-label encoder, or as a wrapper
 *      around any exception raised while building or processing the dataset (including the
 *      {@link IllegalStateException} raised for an unsupported encoder type).
 */
@Override
public EncoderPreprocessor<K, V> fit(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> basePreprocessor) {
    // Only the label encoder can work without an explicit list of handled feature indices.
    if (handledIndices.isEmpty() && encoderType != EncoderType.LABEL_ENCODER)
        throw new RuntimeException("Add indices of handled features");
    try (Dataset<EmptyContext, EncoderPartitionData> dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), (env, upstream, upstreamSize, ctx) -> {
        EncoderPartitionData partData = new EncoderPartitionData();
        if (encoderType == EncoderType.LABEL_ENCODER) {
            // Stays null when the partition has no rows.
            Map<String, Integer> lbFrequencies = null;
            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector<Double> row = basePreprocessor.apply(entity.getKey(), entity.getValue());
                lbFrequencies = updateLabelFrequenciesForNextRow(row, lbFrequencies);
            }
            partData.withLabelFrequencies(lbFrequencies);
        } else if (encoderType == EncoderType.TARGET_ENCODER) {
            // Stays null when the partition has no rows.
            TargetCounter[] targetCounter = null;
            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector<Double> row = basePreprocessor.apply(entity.getKey(), entity.getValue());
                targetCounter = updateTargetCountersForNextRow(row, targetCounter);
            }
            partData.withTargetCounters(targetCounter);
        } else {
            // This array will contain not null values for handled indices
            Map<String, Integer>[] categoryFrequencies = null;
            while (upstream.hasNext()) {
                UpstreamEntry<K, V> entity = upstream.next();
                LabeledVector<Double> row = basePreprocessor.apply(entity.getKey(), entity.getValue());
                categoryFrequencies = updateFeatureFrequenciesForNextRow(row, categoryFrequencies);
            }
            partData.withCategoryFrequencies(categoryFrequencies);
        }
        return partData;
    }, learningEnvironment(basePreprocessor))) {
        switch(encoderType) {
            case ONE_HOT_ENCODER:
                return new OneHotEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
            case STRING_ENCODER:
                return new StringEncoderPreprocessor<>(calculateEncodingValuesByFrequencies(dataset), basePreprocessor, handledIndices);
            case LABEL_ENCODER:
                return new LabelEncoderPreprocessor<>(calculateEncodingValuesForLabelsByFrequencies(dataset), basePreprocessor);
            case FREQUENCY_ENCODER:
                return new FrequencyEncoderPreprocessor<>(calculateEncodingFrequencies(dataset), basePreprocessor, handledIndices);
            case TARGET_ENCODER:
                return new TargetEncoderPreprocessor<>(calculateTargetEncodingFrequencies(dataset), basePreprocessor, handledIndices);
            default:
                throw new IllegalStateException("Define the type of the resulting preprocessor.");
        }
    } catch (Exception e) {
        // Dataset#close and the computations above may throw; everything is surfaced as unchecked.
        throw new RuntimeException(e);
    }
}
Also used : EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) StringEncoderPreprocessor(org.apache.ignite.ml.preprocessing.encoding.stringencoder.StringEncoderPreprocessor) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) TargetEncoderPreprocessor(org.apache.ignite.ml.preprocessing.encoding.target.TargetEncoderPreprocessor) FrequencyEncoderPreprocessor(org.apache.ignite.ml.preprocessing.encoding.frequency.FrequencyEncoderPreprocessor) UndefinedLabelException(org.apache.ignite.ml.math.exceptions.preprocessing.UndefinedLabelException) OneHotEncoderPreprocessor(org.apache.ignite.ml.preprocessing.encoding.onehotencoder.OneHotEncoderPreprocessor) LabelEncoderPreprocessor(org.apache.ignite.ml.preprocessing.encoding.label.LabelEncoderPreprocessor) UpstreamEntry(org.apache.ignite.ml.dataset.UpstreamEntry)

Example 2 with UpstreamEntry

Use of org.apache.ignite.ml.dataset.UpstreamEntry in the Apache Ignite project.

The class GaussianNaiveBayesTrainer, method updateModel.

/**
 * {@inheritDoc}
 *
 * <p>For every partition this accumulates, per label, the feature sums, the squared feature sums and the
 * row count. Partition results are merged, optionally merged with the sums holder of {@code mdl}, and then
 * turned into per-label means, variances and class probabilities.
 *
 * @param mdl Previous model; when it is non-null and has a sums holder, that holder is merged into the
 *      newly computed statistics.
 * @param datasetBuilder Builder used to construct the dataset; must not be {@code null}.
 * @param extractor Preprocessor applied to every upstream entry to obtain the labeled vector.
 * @return Model built from the merged statistics.
 * @throws RuntimeException As a wrapper around any exception raised while building or processing the dataset.
 */
@Override
protected <K, V> GaussianNaiveBayesModel updateModel(GaussianNaiveBayesModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> extractor) {
    assert datasetBuilder != null;
    try (Dataset<EmptyContext, GaussianNaiveBayesSumsHolder> dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), (env, upstream, upstreamSize, ctx) -> {
        GaussianNaiveBayesSumsHolder res = new GaussianNaiveBayesSumsHolder();
        while (upstream.hasNext()) {
            UpstreamEntry<K, V> entity = upstream.next();
            LabeledVector lv = extractor.apply(entity.getKey(), entity.getValue());
            Vector features = lv.features();
            Double label = (Double) lv.label();
            // Freshly allocated double arrays are already zero-filled, so no explicit fill is required.
            if (!res.featureSumsPerLbl.containsKey(label))
                res.featureSumsPerLbl.put(label, new double[features.size()]);
            if (!res.featureSquaredSumsPerLbl.containsKey(label))
                res.featureSquaredSumsPerLbl.put(label, new double[features.size()]);
            if (!res.featureCountersPerLbl.containsKey(label))
                res.featureCountersPerLbl.put(label, 0);
            res.featureCountersPerLbl.put(label, res.featureCountersPerLbl.get(label) + 1);
            double[] toMeans = res.featureSumsPerLbl.get(label);
            double[] sqSum = res.featureSquaredSumsPerLbl.get(label);
            for (int j = 0; j < features.size(); j++) {
                double x = features.get(j);
                toMeans[j] += x;
                sqSum[j] += x * x;
            }
        }
        return res;
    }, learningEnvironment())) {
        // Either side of the reduction may be null, in which case the other side is taken as is.
        GaussianNaiveBayesSumsHolder sumsHolder = dataset.compute(t -> t, (a, b) -> {
            if (a == null)
                return b;
            if (b == null)
                return a;
            return a.merge(b);
        });
        if (mdl != null && mdl.getSumsHolder() != null)
            sumsHolder = sumsHolder.merge(mdl.getSumsHolder());
        List<Double> sortedLabels = new ArrayList<>(sumsHolder.featureCountersPerLbl.keySet());
        sortedLabels.sort(Double::compareTo);
        assert !sortedLabels.isEmpty() : "The dataset should contain at least one feature";
        int labelCount = sortedLabels.size();
        int featureCount = sumsHolder.featureSumsPerLbl.get(sortedLabels.get(0)).length;
        double[][] means = new double[labelCount][featureCount];
        double[][] variances = new double[labelCount][featureCount];
        double[] classProbabilities = new double[labelCount];
        double[] labels = new double[labelCount];
        // Sum as long: adding the per-label int counters in int arithmetic could overflow.
        long datasetSize = sumsHolder.featureCountersPerLbl.values().stream().mapToLong(i -> i).sum();
        int lbl = 0;
        for (Double label : sortedLabels) {
            int count = sumsHolder.featureCountersPerLbl.get(label);
            double[] sum = sumsHolder.featureSumsPerLbl.get(label);
            double[] sqSum = sumsHolder.featureSquaredSumsPerLbl.get(label);
            for (int i = 0; i < featureCount; i++) {
                means[lbl][i] = sum[i] / count;
                // Variance computed from the sum of squares and the squared sum.
                variances[lbl][i] = (sqSum[i] - sum[i] * sum[i] / count) / count;
            }
            if (equiprobableClasses)
                classProbabilities[lbl] = 1. / labelCount;
            else if (priorProbabilities != null) {
                assert classProbabilities.length == priorProbabilities.length;
                classProbabilities[lbl] = priorProbabilities[lbl];
            } else
                classProbabilities[lbl] = (double) count / datasetSize;
            labels[lbl] = label;
            ++lbl;
        }
        return new GaussianNaiveBayesModel(means, variances, classProbabilities, labels, sumsHolder);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Arrays(java.util.Arrays) List(java.util.List) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) Dataset(org.apache.ignite.ml.dataset.Dataset) SingleLabelDatasetTrainer(org.apache.ignite.ml.trainers.SingleLabelDatasetTrainer) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) Preprocessor(org.apache.ignite.ml.preprocessing.Preprocessor) DatasetBuilder(org.apache.ignite.ml.dataset.DatasetBuilder) LearningEnvironmentBuilder(org.apache.ignite.ml.environment.LearningEnvironmentBuilder) ArrayList(java.util.ArrayList) UpstreamEntry(org.apache.ignite.ml.dataset.UpstreamEntry) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) ArrayList(java.util.ArrayList) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) Vector(org.apache.ignite.ml.math.primitives.vector.Vector)

Example 3 with UpstreamEntry

Use of org.apache.ignite.ml.dataset.UpstreamEntry in the Apache Ignite project.

The class DiscreteNaiveBayesTrainer, method updateModel.

/**
 * {@inheritDoc}
 *
 * <p>For every partition this counts, per label and per feature, how many values fall into each bucket
 * defined by {@code bucketThresholds}, together with the number of rows per label. Partition results are
 * merged, optionally merged with the sums holder of {@code mdl}, and then turned into per-bucket
 * probabilities and class probabilities.
 *
 * @param mdl Previous model; its sums holder is merged in only when the model is non-null, updateable and
 *      its sums holder passes {@code checkSumsHolder}.
 * @param datasetBuilder Builder used to construct the dataset.
 * @param extractor Preprocessor applied to every upstream entry to obtain the labeled vector.
 * @return Model built from the merged statistics.
 * @throws RuntimeException As a wrapper around any exception raised while building or processing the dataset.
 */
@Override
protected <K, V> DiscreteNaiveBayesModel updateModel(DiscreteNaiveBayesModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> extractor) {
    try (Dataset<EmptyContext, DiscreteNaiveBayesSumsHolder> dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), (env, upstream, upstreamSize, ctx) -> {
        DiscreteNaiveBayesSumsHolder res = new DiscreteNaiveBayesSumsHolder();
        while (upstream.hasNext()) {
            UpstreamEntry<K, V> entity = upstream.next();
            LabeledVector lv = extractor.apply(entity.getKey(), entity.getValue());
            Vector features = lv.features();
            Double lb = (Double) lv.label();
            int size = features.size();
            if (!res.valuesInBucketPerLbl.containsKey(lb)) {
                long[][] newBuckets = new long[size][];
                // One counter per bucket: thresholds split the axis into (thresholds + 1) buckets.
                // Freshly allocated long arrays are already zero-filled.
                for (int i = 0; i < size; i++)
                    newBuckets[i] = new long[bucketThresholds[i].length + 1];
                res.valuesInBucketPerLbl.put(lb, newBuckets);
            }
            if (!res.featureCountersPerLbl.containsKey(lb))
                res.featureCountersPerLbl.put(lb, 0);
            res.featureCountersPerLbl.put(lb, res.featureCountersPerLbl.get(lb) + 1);
            long[][] valuesInBucket = res.valuesInBucketPerLbl.get(lb);
            for (int j = 0; j < size; j++) {
                double x = features.get(j);
                int bucketNum = toBucketNumber(x, bucketThresholds[j]);
                valuesInBucket[j][bucketNum] += 1;
            }
        }
        return res;
    }, learningEnvironment())) {
        // Either side of the reduction may be null, in which case the other side is taken as is.
        DiscreteNaiveBayesSumsHolder sumsHolder = dataset.compute(t -> t, (a, b) -> {
            if (a == null)
                return b;
            if (b == null)
                return a;
            return a.merge(b);
        });
        if (mdl != null && isUpdateable(mdl)) {
            if (checkSumsHolder(sumsHolder, mdl.getSumsHolder()))
                sumsHolder = sumsHolder.merge(mdl.getSumsHolder());
        }
        List<Double> sortedLabels = new ArrayList<>(sumsHolder.featureCountersPerLbl.keySet());
        sortedLabels.sort(Double::compareTo);
        assert !sortedLabels.isEmpty() : "The dataset should contain at least one feature";
        int lbCnt = sortedLabels.size();
        int featureCnt = sumsHolder.valuesInBucketPerLbl.get(sortedLabels.get(0)).length;
        double[][][] probabilities = new double[lbCnt][featureCnt][];
        double[] classProbabilities = new double[lbCnt];
        double[] labels = new double[lbCnt];
        // Sum as long: adding the per-label int counters in int arithmetic could overflow.
        long datasetSize = sumsHolder.featureCountersPerLbl.values().stream().mapToLong(i -> i).sum();
        int lbl = 0;
        for (Double label : sortedLabels) {
            int cnt = sumsHolder.featureCountersPerLbl.get(label);
            long[][] sum = sumsHolder.valuesInBucketPerLbl.get(label);
            for (int i = 0; i < featureCnt; i++) {
                int bucketsCnt = sum[i].length;
                probabilities[lbl][i] = new double[bucketsCnt];
                for (int j = 0; j < bucketsCnt; j++)
                    probabilities[lbl][i][j] = (double) sum[i][j] / cnt;
            }
            if (equiprobableClasses)
                classProbabilities[lbl] = 1. / lbCnt;
            else if (priorProbabilities != null) {
                assert classProbabilities.length == priorProbabilities.length;
                classProbabilities[lbl] = priorProbabilities[lbl];
            } else
                classProbabilities[lbl] = (double) cnt / datasetSize;
            labels[lbl] = label;
            ++lbl;
        }
        return new DiscreteNaiveBayesModel(probabilities, classProbabilities, labels, bucketThresholds, sumsHolder);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Arrays(java.util.Arrays) List(java.util.List) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) Dataset(org.apache.ignite.ml.dataset.Dataset) SingleLabelDatasetTrainer(org.apache.ignite.ml.trainers.SingleLabelDatasetTrainer) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) Preprocessor(org.apache.ignite.ml.preprocessing.Preprocessor) Optional(java.util.Optional) DatasetBuilder(org.apache.ignite.ml.dataset.DatasetBuilder) ArrayList(java.util.ArrayList) UpstreamEntry(org.apache.ignite.ml.dataset.UpstreamEntry) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) ArrayList(java.util.ArrayList) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) Vector(org.apache.ignite.ml.math.primitives.vector.Vector)

Example 4 with UpstreamEntry

Use of org.apache.ignite.ml.dataset.UpstreamEntry in the Apache Ignite project.

The class ComputeUtilsTest, method testGetData.

/**
 * Checks {@code getData()}: the call is repeated ten times against a single-entry upstream cache, every call
 * must yield exactly one partition data element holding key + value, and the partition data builder must
 * have been invoked exactly once overall.
 */
@Test
public void testGetData() {
    ClusterNode locNode = grid(1).cluster().localNode();

    String upstreamName = "CACHE_1_" + UUID.randomUUID();
    String datasetName = "CACHE_2_" + UUID.randomUUID();

    // Upstream cache bound to the chosen node.
    CacheConfiguration<Integer, Integer> upstreamCfg = new CacheConfiguration<>();
    upstreamCfg.setName(upstreamName);
    upstreamCfg.setAffinity(new TestAffinityFunction(locNode));
    IgniteCache<Integer, Integer> upstreamData = ignite.createCache(upstreamCfg);

    // Dataset cache bound to the same node.
    CacheConfiguration<Integer, Integer> datasetCfg = new CacheConfiguration<>();
    datasetCfg.setName(datasetName);
    datasetCfg.setAffinity(new TestAffinityFunction(locNode));
    IgniteCache<Integer, Integer> datasetData = ignite.createCache(datasetCfg);

    upstreamData.put(42, 42);
    datasetData.put(0, 0);

    UUID dsId = UUID.randomUUID();
    // Counts how many times the partition data builder is actually executed.
    IgniteAtomicLong builderCalls = ignite.atomicLong("CNT_" + dsId, 0, true);

    for (int attempt = 0; attempt < 10; attempt++) {
        Collection<TestPartitionData> results = ComputeUtils.affinityCallWithRetries(
            ignite,
            Arrays.asList(datasetName, upstreamName),
            part -> ComputeUtils.<Integer, Integer, Serializable, TestPartitionData>getData(
                ignite,
                upstreamName,
                (k, v) -> true,
                UpstreamTransformerBuilder.identity(),
                datasetName,
                dsId,
                (env, rows, rowsCnt, ctx) -> {
                    builderCalls.incrementAndGet();
                    assertEquals(1, rowsCnt);
                    UpstreamEntry<Integer, Integer> entry = rows.next();
                    return new TestPartitionData(entry.getKey() + entry.getValue());
                },
                TestUtils.testEnvBuilder().buildForWorker(part),
                false
            ),
            0,
            DeployingContext.unitialized()
        );

        assertEquals(1, results.size());

        TestPartitionData first = results.iterator().next();
        assertEquals(84, first.val.intValue());
    }

    assertEquals(1, builderCalls.get());
}
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) Arrays(java.util.Arrays) DeployingContext(org.apache.ignite.ml.environment.deploy.DeployingContext) GridCommonAbstractTest(org.apache.ignite.testframework.junits.common.GridCommonAbstractTest) TestUtils(org.apache.ignite.ml.TestUtils) Collection(java.util.Collection) Test(org.junit.Test) UUID(java.util.UUID) Ignite(org.apache.ignite.Ignite) IgniteCache(org.apache.ignite.IgniteCache) Serializable(java.io.Serializable) List(java.util.List) Ignition(org.apache.ignite.Ignition) ClusterNode(org.apache.ignite.cluster.ClusterNode) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) AffinityFunction(org.apache.ignite.cache.affinity.AffinityFunction) IgniteUtils(org.apache.ignite.internal.util.IgniteUtils) AffinityFunctionContext(org.apache.ignite.cache.affinity.AffinityFunctionContext) IgniteAtomicLong(org.apache.ignite.IgniteAtomicLong) UpstreamTransformerBuilder(org.apache.ignite.ml.dataset.UpstreamTransformerBuilder) Collections(java.util.Collections) UpstreamEntry(org.apache.ignite.ml.dataset.UpstreamEntry) IgniteAtomicLong(org.apache.ignite.IgniteAtomicLong) UUID(java.util.UUID) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) UpstreamEntry(org.apache.ignite.ml.dataset.UpstreamEntry) GridCommonAbstractTest(org.apache.ignite.testframework.junits.common.GridCommonAbstractTest) Test(org.junit.Test)

Example 5 with UpstreamEntry

Use of org.apache.ignite.ml.dataset.UpstreamEntry in the Apache Ignite project.

The class MinMaxScalerTrainer, method fit.

/**
 * {@inheritDoc}
 *
 * <p>Computes the per-feature minimum and maximum over every row produced by {@code basePreprocessor}:
 * each partition tracks its own minimum and maximum arrays, and the partition results are then reduced
 * element-wise.
 *
 * @param envBuilder Learning environment builder passed to the dataset builder.
 * @param datasetBuilder Builder used to construct the dataset.
 * @param basePreprocessor Preprocessor applied to every upstream entry to obtain the row.
 * @return Preprocessor initialized with the computed minimum and maximum arrays.
 * @throws RuntimeException As a wrapper around any exception raised while building or processing the
 *      dataset, including the {@link IllegalStateException} raised when the dataset has no rows.
 */
@Override
public MinMaxScalerPreprocessor<K, V> fit(LearningEnvironmentBuilder envBuilder, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> basePreprocessor) {
    PartitionContextBuilder<K, V, EmptyContext> ctxBuilder = (env, upstream, upstreamSize) -> new EmptyContext();
    try (Dataset<EmptyContext, MinMaxScalerPartitionData> dataset = datasetBuilder.build(envBuilder, ctxBuilder, (env, upstream, upstreamSize, ctx) -> {
        // Both arrays stay null when the partition has no rows.
        double[] min = null;
        double[] max = null;
        while (upstream.hasNext()) {
            UpstreamEntry<K, V> entity = upstream.next();
            LabeledVector row = basePreprocessor.apply(entity.getKey(), entity.getValue());
            if (min == null) {
                min = new double[row.size()];
                Arrays.fill(min, Double.MAX_VALUE);
            } else
                assert min.length == row.size() : "Base preprocessor must return exactly " + min.length + " features";
            if (max == null) {
                max = new double[row.size()];
                Arrays.fill(max, -Double.MAX_VALUE);
            } else
                assert max.length == row.size() : "Base preprocessor must return exactly " + max.length + " features";
            for (int i = 0; i < row.size(); i++) {
                double val = row.get(i);
                if (val < min[i])
                    min[i] = val;
                if (val > max[i])
                    max[i] = val;
            }
        }
        return new MinMaxScalerPartitionData(min, max);
    }, learningEnvironment(basePreprocessor))) {
        // Empty partitions are mapped to null and skipped by the reducer.
        double[][] minMax = dataset.compute(data -> data.getMin() != null ? new double[][] { data.getMin(), data.getMax() } : null, (a, b) -> {
            if (a == null)
                return b;
            if (b == null)
                return a;
            double[][] res = new double[2][];
            res[0] = new double[a[0].length];
            for (int i = 0; i < res[0].length; i++)
                res[0][i] = Math.min(a[0][i], b[0][i]);
            res[1] = new double[a[1].length];
            for (int i = 0; i < res[1].length; i++)
                res[1][i] = Math.max(a[1][i], b[1][i]);
            return res;
        });
        // No partition produced data: fail with a clear message instead of a NullPointerException.
        if (minMax == null)
            throw new IllegalStateException("Cannot fit min-max scaler: the dataset contains no rows");
        return new MinMaxScalerPreprocessor<>(minMax[0], minMax[1], basePreprocessor);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Also used : Arrays(java.util.Arrays) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) PartitionContextBuilder(org.apache.ignite.ml.dataset.PartitionContextBuilder) Dataset(org.apache.ignite.ml.dataset.Dataset) Preprocessor(org.apache.ignite.ml.preprocessing.Preprocessor) DatasetBuilder(org.apache.ignite.ml.dataset.DatasetBuilder) PreprocessingTrainer(org.apache.ignite.ml.preprocessing.PreprocessingTrainer) LearningEnvironmentBuilder(org.apache.ignite.ml.environment.LearningEnvironmentBuilder) UpstreamEntry(org.apache.ignite.ml.dataset.UpstreamEntry) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) LabeledVector(org.apache.ignite.ml.structures.LabeledVector)

Aggregations

UpstreamEntry (org.apache.ignite.ml.dataset.UpstreamEntry)10 Arrays (java.util.Arrays)7 LearningEnvironmentBuilder (org.apache.ignite.ml.environment.LearningEnvironmentBuilder)6 ArrayList (java.util.ArrayList)5 DatasetBuilder (org.apache.ignite.ml.dataset.DatasetBuilder)5 PartitionContextBuilder (org.apache.ignite.ml.dataset.PartitionContextBuilder)5 EmptyContext (org.apache.ignite.ml.dataset.primitive.context.EmptyContext)5 Serializable (java.io.Serializable)4 List (java.util.List)4 Map (java.util.Map)4 Dataset (org.apache.ignite.ml.dataset.Dataset)4 UpstreamTransformerBuilder (org.apache.ignite.ml.dataset.UpstreamTransformerBuilder)4 LabeledVector (org.apache.ignite.ml.structures.LabeledVector)4 Collection (java.util.Collection)3 HashMap (java.util.HashMap)3 Iterator (java.util.Iterator)3 UUID (java.util.UUID)3 Ignite (org.apache.ignite.Ignite)3 IgniteCache (org.apache.ignite.IgniteCache)3 Ignition (org.apache.ignite.Ignition)3