Search in sources :

Example 81 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class KMeansTrainer method initClusterCentersRandomly.

/**
 * Picks {@code k} initial cluster centers at random from the rows of the dataset.
 *
 * <p>Every non-empty partition contributes candidate rows: {@code k} randomly indexed rows when it holds more
 * than {@code k} rows, otherwise all of its rows. The candidates of all partitions are concatenated, shuffled,
 * and {@code k} of them are then drawn without replacement.
 *
 * @param dataset The dataset to pick up random centers.
 * @param k Amount of clusters.
 * @return K cluster centers.
 * @throws RuntimeException If fewer than {@code k} candidate vectors were collected from the dataset.
 */
private Vector[] initClusterCentersRandomly(Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset, int k) {
    // Allocated as Vector[] (not DenseVector[]): features() is only known to return a Vector, and storing a
    // non-dense implementation into a DenseVector[] would fail with ArrayStoreException.
    Vector[] initCenters = new Vector[k];
    // Gets k or less vectors from each partition.
    List<LabeledVector> rndPnts = dataset.compute(data -> {
        List<LabeledVector> rndPnt = new ArrayList<>();
        if (data.rowSize() != 0) {
            if (data.rowSize() > k) {
                // There are enough rows in the partition to pick k vectors.
                final Random random = environment.randomNumbersGenerator();
                // Shared by all k draws; re-creating it per draw would make the duplicate check a no-op.
                Set<Integer> uniqueIndices = new HashSet<>();
                for (int i = 0; i < k; i++) {
                    int nextIdx = random.nextInt(data.rowSize());
                    // Upper bound on re-draws, so the loop below is guaranteed to terminate.
                    int maxRandomSearch = k;
                    int cntr = 0;
                    // Repeat nextIdx generation if it was picked earlier.
                    while (uniqueIndices.contains(nextIdx) && cntr < maxRandomSearch) {
                        nextIdx = random.nextInt(data.rowSize());
                        cntr++;
                    }
                    // If the re-draw budget ran out, the index may still be a repeat; it is accepted anyway
                    // so that the partition always contributes exactly k candidates.
                    uniqueIndices.add(nextIdx);
                    rndPnt.add(data.getRow(nextIdx));
                }
            } else {
                // Not enough rows to pick k distinct vectors: take every row of the partition.
                for (int i = 0; i < data.rowSize(); i++) {
                    rndPnt.add(data.getRow(i));
                }
            }
        }
        return rndPnt;
    }, (a, b) -> {
        // Merge per-partition candidate lists, tolerating a missing result on either side.
        if (a == null)
            return b == null ? new ArrayList<>() : b;
        if (b == null)
            return a;
        return Stream.concat(a.stream(), b.stream()).collect(Collectors.toList());
    });
    // Shuffle them.
    Collections.shuffle(rndPnts);
    // Pick k vectors randomly.
    if (rndPnts.size() >= k) {
        for (int i = 0; i < k; i++) {
            int idx = environment.randomNumbersGenerator().nextInt(rndPnts.size());
            // Remove by position: drops exactly the drawn candidate without an equality scan of the list.
            initCenters[i] = rndPnts.remove(idx).features();
        }
    } else
        throw new RuntimeException("The KMeans Trainer required more than " + k + " vectors to find " + k + " clusters");
    return initCenters;
}
Also used : Random(java.util.Random) ArrayList(java.util.ArrayList) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) HashSet(java.util.HashSet)

Example 82 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class KMeansTrainer method updateModel.

/**
 * {@inheritDoc}
 *
 * <p>Builds a dataset from the given builder, takes the starting centers from {@code mdl} when it supplies them
 * (otherwise picks them randomly from the dataset), and then repeatedly replaces each center by the mass center
 * computed for it until either {@code maxIterations} passes were made or no center moved by more than
 * {@code epsilon * epsilon} according to {@code distance}.
 */
@Override
protected <K, V> KMeansModel updateModel(KMeansModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    assert datasetBuilder != null;

    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partBuilder = new LabeledDatasetPartitionDataBuilderOnHeap<>(preprocessor);

    Vector[] centers;

    try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = datasetBuilder.build(envBuilder, (env, upstream, upstreamSize) -> new EmptyContext(), partBuilder, learningEnvironment())) {
        // Column count reduced over partitions: the right-hand value wins when present,
        // otherwise the left-hand one, and 0 when both are missing.
        final Integer featureCnt = dataset.compute(org.apache.ignite.ml.structures.Dataset::colSize, (left, right) -> {
            if (right != null)
                return right;

            return left == null ? 0 : left;
        });

        // No reduced value at all: fall back to the previous model (or fail) via the helper.
        if (featureCnt == null)
            return getLastTrainedModelOrThrowEmptyDatasetException(mdl);

        // Reuse centers of the model being updated when available, otherwise start from random rows.
        Vector[] prevCenters = mdl == null ? null : mdl.centers();
        centers = prevCenters != null ? prevCenters : initClusterCentersRandomly(dataset, k);

        boolean converged = false;

        for (int pass = 0; pass < maxIterations && !converged; pass++) {
            Vector[] recomputed = new DenseVector[k];
            TotalCostAndCounts stats = calcDataForNewCentroids(centers, dataset, featureCnt);

            // Assume convergence until some center is seen to move too far.
            converged = true;

            for (Map.Entry<Integer, Vector> sumEntry : stats.sums.entrySet()) {
                Integer clusterIdx = sumEntry.getKey();
                Vector massCenter = sumEntry.getValue().times(1.0 / stats.counts.get(clusterIdx));

                // The distance is only evaluated while no earlier center has broken convergence.
                boolean movedTooFar = converged && distance.compute(massCenter, centers[clusterIdx]) > epsilon * epsilon;

                if (movedTooFar)
                    converged = false;

                recomputed[clusterIdx] = massCenter;
            }

            // Centers that received no mass center in this pass keep their previous position.
            for (int c = 0; c < centers.length; c++) {
                Vector replacement = recomputed[c];

                if (replacement != null)
                    centers[c] = replacement;
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    return new KMeansModel(centers, distance);
}
Also used : EmptyContext(org.apache.ignite.ml.dataset.primitive.context.EmptyContext) LabeledDatasetPartitionDataBuilderOnHeap(org.apache.ignite.ml.structures.partition.LabeledDatasetPartitionDataBuilderOnHeap) Dataset(org.apache.ignite.ml.dataset.Dataset) LabeledVectorSet(org.apache.ignite.ml.structures.LabeledVectorSet) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) DenseVector(org.apache.ignite.ml.math.primitives.vector.impl.DenseVector)

Example 83 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class GDBLearningStrategy method update.

/**
 * Continues boosting from the state held by the given model: starting from the models list produced by
 * {@code initLearningState}, it trains up to {@code cntOfIterations} additional base models, each fitted to the
 * negated loss gradient of the current composition, and stops early once the convergence checker reports
 * convergence.
 *
 * @param mdlToUpdate Learned model.
 * @param datasetBuilder Dataset builder.
 * @param preprocessor Upstream preprocessor.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @return Updated models list.
 * @throws IllegalStateException If the trainer environment has not been set.
 */
public <K, V> List<IgniteModel<Vector, Double>> update(GDBModel mdlToUpdate, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    if (trainerEnvironment == null)
        throw new IllegalStateException("Learning environment builder is not set.");

    List<IgniteModel<Vector, Double>> ensemble = initLearningState(mdlToUpdate);
    ConvergenceChecker<K, V> convergence = checkConvergenceStgyFactory.create(sampleSize, externalLbToInternalMapping, loss, datasetBuilder, preprocessor);
    DatasetTrainer<? extends IgniteModel<Vector, Double>, Double> baseTrainer = baseMdlTrainerBuilder.get();

    int round = 0;

    while (round < cntOfIterations) {
        // Composition of everything trained so far, weighted by the leading composition weights.
        WeightedPredictionsAggregator aggregator = new WeightedPredictionsAggregator(Arrays.copyOf(compositionWeights, ensemble.size()), meanLbVal);
        ModelsComposition snapshot = new ModelsComposition(ensemble, aggregator);

        if (convergence.isConverged(envBuilder, datasetBuilder, snapshot))
            break;

        // Relabels every sample with the negated loss gradient of the current composition's prediction.
        Vectorizer<K, V, Serializable, Double> gradientExtractor = new Vectorizer.VectorizerAdapter<K, V, Serializable, Double>() {

            /**
             * {@inheritDoc}
             */
            @Override
            public LabeledVector<Double> extract(K k, V v) {
                LabeledVector<Double> sample = preprocessor.apply(k, v);
                Vector sampleFeatures = sample.features();
                Double expected = externalLbToInternalMapping.apply(sample.label());
                Double predicted = snapshot.predict(sampleFeatures);

                return new LabeledVector<>(sampleFeatures, -loss.gradient(sampleSize, expected, predicted));
            }
        };

        long startedAt = System.currentTimeMillis();

        ensemble.add(baseTrainer.fit(datasetBuilder, gradientExtractor));

        long elapsedMs = System.currentTimeMillis() - startedAt;
        double learningTime = elapsedMs / 1000.0;

        trainerEnvironment.logger(getClass()).log(MLLogger.VerboseLevel.LOW, "One model training time was %.2fs", learningTime);

        round++;
    }

    return ensemble;
}
Also used : Serializable(java.io.Serializable) WeightedPredictionsAggregator(org.apache.ignite.ml.composition.predictionsaggregator.WeightedPredictionsAggregator) ModelsComposition(org.apache.ignite.ml.composition.ModelsComposition) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) IgniteModel(org.apache.ignite.ml.IgniteModel) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector)

Example 84 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class GmmPartitionData method updatePcxi.

/**
 * Updates P(c|xi) values in partitions given components probabilities and components of GMM.
 *
 * <p>For every row {@code i} and component {@code c}, {@code data.pcxi[i][c]} is overwritten with the component's
 * weighted density at the row divided by the sum of the weighted densities of all components at that row.
 * NOTE(review): if that sum is zero for a row the stored values are not finite; confirm callers tolerate this.
 *
 * @param data Partition data whose {@code pcxi} matrix is updated in place.
 * @param clusterProbs Component probabilities.
 * @param components Components.
 * @return The largest value of the mixture model's {@code prob} over the rows of {@code data}, or
 *      {@link Double#NEGATIVE_INFINITY} when {@code data} has no rows.
 */
static double updatePcxi(GmmPartitionData data, Vector clusterProbs, List<MultivariateGaussianDistribution> components) {
    GmmModel model = new GmmModel(clusterProbs, components);
    int componentCnt = clusterProbs.size();
    // Scratch buffer reused across rows: weighted density of each component at the current row.
    double[] weightedProbs = new double[componentCnt];
    double maxProb = Double.NEGATIVE_INFINITY;
    for (int i = 0; i < data.size(); i++) {
        Vector x = data.getX(i);
        double xProb = model.prob(x);
        if (xProb > maxProb)
            maxProb = xProb;
        // Evaluate each component density once and keep it, instead of recomputing it for the division below.
        double normalizer = 0.0;
        for (int c = 0; c < componentCnt; c++) {
            weightedProbs[c] = components.get(c).prob(x) * clusterProbs.get(c);
            normalizer += weightedProbs[c];
        }
        for (int c = 0; c < componentCnt; c++) {
            data.pcxi[i][c] = weightedProbs[c] / normalizer;
        }
    }
    return maxProb;
}
Also used : Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector)

Example 85 with Vector

use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.

the class MeanWithClusterProbAggregator method add.

/**
 * Accumulates one observation: adds the vector scaled by its P(c|xi) to the running weighted sum, adds the
 * P(c|xi) itself to the running probability sum, and counts the row.
 *
 * @param x Vector.
 * @param pcxi P(c|xi); must lie in the range [0, 1].
 */
void add(Vector x, double pcxi) {
    A.ensure(pcxi >= 0 && pcxi <= 1., "pcxi >= 0 && pcxi <= 1.");

    Vector contribution = x.times(pcxi);

    // The first contribution seeds the sum; later ones are added to it.
    weightedXsSum = weightedXsSum == null ? contribution : weightedXsSum.plus(contribution);

    pcxiSum += pcxi;
    rowCount++;
}
Also used : Vector(org.apache.ignite.ml.math.primitives.vector.Vector)

Aggregations

Vector (org.apache.ignite.ml.math.primitives.vector.Vector)265 DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector)95 Test (org.junit.Test)94 Ignite (org.apache.ignite.Ignite)78 LabeledVector (org.apache.ignite.ml.structures.LabeledVector)49 HashMap (java.util.HashMap)39 SandboxMLCache (org.apache.ignite.examples.ml.util.SandboxMLCache)38 DummyVectorizer (org.apache.ignite.ml.dataset.feature.extractor.impl.DummyVectorizer)26 FileNotFoundException (java.io.FileNotFoundException)22 TrainerTest (org.apache.ignite.ml.common.TrainerTest)22 DecisionTreeClassificationTrainer (org.apache.ignite.ml.tree.DecisionTreeClassificationTrainer)21 DecisionTreeModel (org.apache.ignite.ml.tree.DecisionTreeModel)21 Serializable (java.io.Serializable)19 IgniteCache (org.apache.ignite.IgniteCache)18 EncoderTrainer (org.apache.ignite.ml.preprocessing.encoding.EncoderTrainer)16 Cache (javax.cache.Cache)15 DoubleArrayVectorizer (org.apache.ignite.ml.dataset.feature.extractor.impl.DoubleArrayVectorizer)15 EuclideanDistance (org.apache.ignite.ml.math.distances.EuclideanDistance)14 ArrayList (java.util.ArrayList)12 ModelsComposition (org.apache.ignite.ml.composition.ModelsComposition)12