use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class KMeansTrainer method initClusterCentersRandomly.
/**
 * K cluster centers are initialized randomly.
 *
 * <p>Every partition contributes up to {@code k} candidate rows: {@code k} randomly drawn rows when the partition
 * holds more than {@code k} rows, or all of its rows otherwise. The candidates of all partitions are concatenated,
 * shuffled, and {@code k} of them are drawn without replacement; their feature vectors become the centers.
 *
 * @param dataset The dataset to pick up random centers.
 * @param k Amount of clusters.
 * @return K cluster centers.
 * @throws RuntimeException If fewer than {@code k} candidate vectors were collected from the dataset.
 */
private Vector[] initClusterCentersRandomly(Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset, int k) {
    Vector[] initCenters = new DenseVector[k];

    // Gets k or less vectors from each partition.
    List<LabeledVector> rndPnts = dataset.compute(data -> {
        List<LabeledVector> rndPnt = new ArrayList<>();

        if (data.rowSize() != 0) {
            if (data.rowSize() > k) {
                // If it's enough rows in partition to pick k vectors.
                final Random random = environment.randomNumbersGenerator();

                // Shared by all k draws: the set must outlive a single iteration, otherwise an index
                // picked on an earlier iteration is never recognised as a duplicate.
                Set<Integer> uniqueIndices = new HashSet<>();

                // Upper bound on re-draws per pick, required to keep the retry loop finite.
                final int maxRandomSearch = k;

                for (int i = 0; i < k; i++) {
                    int nextIdx = random.nextInt(data.rowSize());
                    int cntr = 0;

                    // Repeat nextIdx generation if it was picked earlier.
                    while (uniqueIndices.contains(nextIdx) && cntr < maxRandomSearch) {
                        nextIdx = random.nextInt(data.rowSize());
                        cntr++;
                    }

                    // After maxRandomSearch unsuccessful re-draws a repeated index is accepted as is.
                    uniqueIndices.add(nextIdx);
                    rndPnt.add(data.getRow(nextIdx));
                }
            }
            else {
                // If it's not enough vectors to pick k vectors.
                for (int i = 0; i < data.rowSize(); i++)
                    rndPnt.add(data.getRow(i));
            }
        }

        return rndPnt;
    }, (a, b) -> {
        if (a == null)
            return b == null ? new ArrayList<>() : b;

        if (b == null)
            return a;

        return Stream.concat(a.stream(), b.stream()).collect(Collectors.toList());
    });

    // Shuffle them.
    Collections.shuffle(rndPnts);

    // Pick k vectors randomly.
    if (rndPnts.size() >= k) {
        for (int i = 0; i < k; i++) {
            final int pickedIdx = environment.randomNumbersGenerator().nextInt(rndPnts.size());

            // Remove by position so that exactly the drawn element leaves the pool
            // (removal by object would scan the list and drop the first equal element instead).
            initCenters[i] = rndPnts.remove(pickedIdx).features();
        }
    }
    else
        throw new RuntimeException("The KMeans Trainer required more than " + k + " vectors to find " + k + " clusters");

    return initCenters;
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class KMeansTrainer method updateModel.
/**
 * {@inheritDoc}
 */
@Override
protected <K, V> KMeansModel updateModel(KMeansModel mdl, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    assert datasetBuilder != null;

    PartitionDataBuilder<K, V, EmptyContext, LabeledVectorSet<LabeledVector>> partBuilder =
        new LabeledDatasetPartitionDataBuilderOnHeap<>(preprocessor);

    Vector[] centers;

    try (Dataset<EmptyContext, LabeledVectorSet<LabeledVector>> dataset = datasetBuilder.build(
        envBuilder,
        (env, upstream, upstreamSize) -> new EmptyContext(),
        partBuilder,
        learningEnvironment()
    )) {
        // Reduce per-partition column counts: the right-hand value wins when present,
        // otherwise the left-hand one, and 0 when both are absent.
        final Integer cols = dataset.compute(org.apache.ignite.ml.structures.Dataset::colSize, (left, right) -> {
            if (right != null)
                return right;

            return left != null ? left : 0;
        });

        if (cols == null)
            return getLastTrainedModelOrThrowEmptyDatasetException(mdl);

        // Continue from the centers of the given model when available, otherwise start from random ones.
        centers = Optional.ofNullable(mdl)
            .map(KMeansModel::centers)
            .orElseGet(() -> initClusterCentersRandomly(dataset, k));

        boolean converged = false;

        for (int iter = 0; iter < maxIterations && !converged; iter++) {
            Vector[] updated = new DenseVector[k];
            TotalCostAndCounts stats = calcDataForNewCentroids(centers, dataset, cols);

            converged = true;

            for (Map.Entry<Integer, Vector> sumEntry : stats.sums.entrySet()) {
                final Integer clusterIdx = sumEntry.getKey();
                final Vector centroid = sumEntry.getValue().times(1.0 / stats.counts.get(clusterIdx));

                // The distance is evaluated only while no center has moved beyond the threshold yet.
                if (converged) {
                    if (distance.compute(centroid, centers[clusterIdx]) > epsilon * epsilon)
                        converged = false;
                }

                updated[clusterIdx] = centroid;
            }

            // Clusters without a recomputed centroid keep their previous center.
            for (int c = 0; c < centers.length; c++) {
                if (updated[c] != null)
                    centers[c] = updated[c];
            }
        }
    }
    catch (Exception e) {
        throw new RuntimeException(e);
    }

    return new KMeansModel(centers, distance);
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class GDBLearningStrategy method update.
/**
 * Continues training from the state of the given model when it is compatible with the training parameters of
 * this trainer, otherwise trains a new model. On every iteration a base model is fitted to the negated loss
 * gradient of the current composition and appended to the models list; iterations stop early once the
 * convergence checker reports convergence.
 *
 * @param mdlToUpdate Learned model.
 * @param datasetBuilder Dataset builder.
 * @param preprocessor Upstream preprocessor.
 * @param <K> Type of a key in {@code upstream} data.
 * @param <V> Type of a value in {@code upstream} data.
 * @return Updated models list.
 */
public <K, V> List<IgniteModel<Vector, Double>> update(GDBModel mdlToUpdate, DatasetBuilder<K, V> datasetBuilder, Preprocessor<K, V> preprocessor) {
    if (trainerEnvironment == null)
        throw new IllegalStateException("Learning environment builder is not set.");

    final List<IgniteModel<Vector, Double>> ensemble = initLearningState(mdlToUpdate);

    final ConvergenceChecker<K, V> convergence = checkConvergenceStgyFactory.create(
        sampleSize, externalLbToInternalMapping, loss, datasetBuilder, preprocessor);

    final DatasetTrainer<? extends IgniteModel<Vector, Double>, Double> baseTrainer = baseMdlTrainerBuilder.get();

    for (int round = 0; round < cntOfIterations; round++) {
        // Composition of the models trained so far, weighted by the first ensemble.size() composition weights.
        final ModelsComposition snapshot = new ModelsComposition(
            ensemble,
            new WeightedPredictionsAggregator(Arrays.copyOf(compositionWeights, ensemble.size()), meanLbVal)
        );

        if (convergence.isConverged(envBuilder, datasetBuilder, snapshot))
            break;

        // Relabels every sample with the negated loss gradient of the current composition.
        final Vectorizer<K, V, Serializable, Double> gradientExtractor = new Vectorizer.VectorizerAdapter<K, V, Serializable, Double>() {
            /**
             * {@inheritDoc}
             */
            @Override
            public LabeledVector<Double> extract(K k, V v) {
                final LabeledVector<Double> sample = preprocessor.apply(k, v);
                final Vector features = sample.features();
                final Double expected = externalLbToInternalMapping.apply(sample.label());
                final Double predicted = snapshot.predict(features);

                return new LabeledVector<>(features, -loss.gradient(sampleSize, expected, predicted));
            }
        };

        final long fitStartedAt = System.currentTimeMillis();

        ensemble.add(baseTrainer.fit(datasetBuilder, gradientExtractor));

        final double elapsedSec = (System.currentTimeMillis() - fitStartedAt) / 1000.0;

        trainerEnvironment.logger(getClass())
            .log(MLLogger.VerboseLevel.LOW, "One model training time was %.2fs", elapsedSec);
    }

    return ensemble;
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class GmmPartitionData method updatePcxi.
/**
 * Updates P(c|xi) values in partitions given components probabilities and components of GMM.
 *
 * <p>For every row {@code i} and component {@code c} the value
 * {@code components[c].prob(x_i) * clusterProbs[c]} is divided by the sum of these values over all components
 * and stored in {@code data.pcxi[i][c]}.
 *
 * @param data Partition data; its {@code pcxi} matrix is overwritten.
 * @param clusterProbs Component probabilities.
 * @param components Components.
 * @return Maximum of the model probabilities {@code model.prob(x)} over all rows of the partition, or
 *      {@link Double#NEGATIVE_INFINITY} if the partition has no rows.
 */
static double updatePcxi(GmmPartitionData data, Vector clusterProbs, List<MultivariateGaussianDistribution> components) {
    GmmModel model = new GmmModel(clusterProbs, components);
    double maxProb = Double.NEGATIVE_INFINITY;

    final int compCnt = clusterProbs.size();

    // Scratch buffer reused for every row: each component density is evaluated once per row
    // and shared between the normalizer and the per-component ratio.
    final double[] weightedProbs = new double[compCnt];

    for (int i = 0; i < data.size(); i++) {
        Vector x = data.getX(i);

        double xProb = model.prob(x);
        if (xProb > maxProb)
            maxProb = xProb;

        double normalizer = 0.0;
        for (int c = 0; c < compCnt; c++) {
            weightedProbs[c] = components.get(c).prob(x) * clusterProbs.get(c);
            normalizer += weightedProbs[c];
        }

        // NOTE(review): a zero normalizer yields NaN/Infinity here, exactly as before - confirm callers tolerate it.
        for (int c = 0; c < compCnt; c++)
            data.pcxi[i][c] = weightedProbs[c] / normalizer;
    }

    return maxProb;
}
use of org.apache.ignite.ml.math.primitives.vector.Vector in project ignite by apache.
the class MeanWithClusterProbAggregator method add.
/**
 * Accounts one more vector in the accumulated statistics: the vector scaled by {@code pcxi} is added to the
 * weighted sum, {@code pcxi} is added to the probabilities sum and the row counter grows by one.
 *
 * @param x Vector.
 * @param pcxi P(c|xi), must lie in [0, 1].
 */
void add(Vector x, double pcxi) {
    A.ensure(pcxi >= 0 && pcxi <= 1., "pcxi >= 0 && pcxi <= 1.");

    final Vector contribution = x.times(pcxi);

    // The first contribution initializes the sum, the following ones are accumulated.
    weightedXsSum = weightedXsSum != null ? weightedXsSum.plus(contribution) : contribution;

    pcxiSum += pcxi;
    rowCount++;
}
Aggregations