Use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.
The class SparseDistributedMatrixTest, method buildKeySet.
/** Build key set for SparseDistributedMatrix. */
private Set<IgniteBiTuple<Integer, IgniteUuid>> buildKeySet(SparseDistributedMatrix m) {
    Set<IgniteBiTuple<Integer, IgniteUuid>> set = new HashSet<>();
    SparseDistributedMatrixStorage storage = (SparseDistributedMatrixStorage) m.getStorage();
    IgniteUuid uuid = storage.getUUID();
    int size = storage.storageMode() == StorageConstants.ROW_STORAGE_MODE ? storage.rowSize() : storage.columnSize();
    for (int i = 0; i < size; i++)
        set.add(new IgniteBiTuple<>(i, uuid));
    return set;
}
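The key set above pairs each row (or column) index with the matrix's IgniteUuid, which is the key shape of the storage's backing cache. Below is a minimal sketch (not from the Ignite sources) of how such keys could be used to read the raw cache entries behind a SparseDistributedMatrix; it assumes the matrix class lives in org.apache.ignite.ml.math.impls.matrix and uses Object as the cache value type, since the storage's internal value representation is not shown on this page.

import java.util.Map;
import java.util.Set;
import org.apache.ignite.Ignite;
import org.apache.ignite.IgniteCache;
import org.apache.ignite.lang.IgniteBiTuple;
import org.apache.ignite.lang.IgniteUuid;
import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix;
import org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage;

public class KeySetUsageSketch {
    /** Fetches the raw cache entries that back the given matrix, keyed as in buildKeySet above. */
    static Map<IgniteBiTuple<Integer, IgniteUuid>, Object> readEntries(Ignite ignite, SparseDistributedMatrix m,
        Set<IgniteBiTuple<Integer, IgniteUuid>> keys) {
        SparseDistributedMatrixStorage storage = (SparseDistributedMatrixStorage) m.getStorage();
        // cacheName() is the same accessor used by the clusterer snippets below.
        IgniteCache<IgniteBiTuple<Integer, IgniteUuid>, Object> cache = ignite.cache(storage.cacheName());
        return cache.getAll(keys);
    }
}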
Use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.
The class FuzzyCMeansDistributedClusterer, method initializeCenters.
/**
 * Choose k primary centers from source points.
 *
 * @param points Matrix with source points.
 * @param k Number of centers.
 * @return Array of primary centers.
 */
private Vector[] initializeCenters(SparseDistributedMatrix points, int k) {
    int pointsNum = points.rowSize();
    Vector firstCenter = points.viewRow(rnd.nextInt(pointsNum));
    List<Vector> centers = new ArrayList<>();
    List<Vector> newCenters = new ArrayList<>();
    centers.add(firstCenter);
    newCenters.add(firstCenter);
    ConcurrentHashMap<Integer, Double> costs = new ConcurrentHashMap<>();
    int step = 0;
    UUID uuid = points.getUUID();
    String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName();
    while (step < initSteps) {
        ConcurrentHashMap<Integer, Double> newCosts = getNewCosts(cacheName, uuid, newCenters);
        for (Integer key : newCosts.keySet())
            costs.merge(key, newCosts.get(key), Math::min);
        double costsSum = costs.values().stream().mapToDouble(Double::valueOf).sum();
        newCenters = getNewCenters(cacheName, uuid, costs, costsSum, k);
        centers.addAll(newCenters);
        step++;
    }
    return chooseKCenters(cacheName, uuid, centers, k);
}
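The loop above is the oversampling phase of k-means||-style seeding: on each step the per-point cost map keeps the minimum cost against any center chosen so far, merged in with Math::min, and the cost sum then drives the sampling of new centers inside getNewCenters. A small self-contained illustration of that merge step (plain Java, no Ignite), assuming the values are squared distances to the nearest center:

import java.util.concurrent.ConcurrentHashMap;

public class CostMergeSketch {
    public static void main(String[] args) {
        ConcurrentHashMap<Integer, Double> costs = new ConcurrentHashMap<>();
        // Point 0 was already within squared distance 1.0 of some earlier center.
        costs.put(0, 1.0);
        // Costs computed against the centers added on the current step.
        ConcurrentHashMap<Integer, Double> newCosts = new ConcurrentHashMap<>();
        newCosts.put(0, 4.0);
        newCosts.put(1, 9.0);
        // Same merge as in initializeCenters: keep the minimum cost seen so far.
        for (Integer key : newCosts.keySet())
            costs.merge(key, newCosts.get(key), Math::min);
        System.out.println(costs.get(0) + " " + costs.get(1)); // 1.0 9.0
    }
}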
Use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.
The class KMeansDistributedClusterer, method cluster.
/** {@inheritDoc} */
@Override
public KMeansModel cluster(SparseDistributedMatrix points, int k) throws MathIllegalArgumentException, ConvergenceException {
    SparseDistributedMatrix pointsCp = (SparseDistributedMatrix) points.like(points.rowSize(), points.columnSize());
    String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName();
    // TODO: IGNITE-5825, this copy is very inefficient, just for POC. Immutability of data should be guaranteed by other means,
    // such as logical locks, for example.
    pointsCp.assign(points);
    Vector[] centers = initClusterCenters(pointsCp, k);
    boolean converged = false;
    int iteration = 0;
    int dim = pointsCp.viewRow(0).size();
    UUID uid = pointsCp.getUUID();
    // Execute iterations of Lloyd's algorithm until convergence.
    while (iteration < maxIterations && !converged) {
        SumsAndCounts stats = getSumsAndCounts(centers, dim, uid, cacheName);
        converged = true;
        for (Integer ind : stats.sums.keySet()) {
            Vector massCenter = stats.sums.get(ind).times(1.0 / stats.counts.get(ind));
            if (converged && distance(massCenter, centers[ind]) > epsilon * epsilon)
                converged = false;
            centers[ind] = massCenter;
        }
        iteration++;
    }
    pointsCp.destroy();
    return new KMeansModel(centers, getDistanceMeasure());
}
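Each iteration above replaces every center with the mass center of the points currently assigned to it and stops once no center moves by more than the tolerance; the distributed part (getSumsAndCounts) only aggregates per-cluster sums and counts over the matrix cache. Below is a minimal single-node sketch of the same Lloyd update on 1-D data, assuming plain Euclidean distance and two clusters:

public class LloydStepSketch {
    public static void main(String[] args) {
        double[] points = {1.0, 1.2, 5.0, 5.2};
        double[] centers = {0.0, 6.0};
        double eps = 1e-3;
        boolean converged = false;
        while (!converged) {
            double[] sums = new double[centers.length];
            int[] counts = new int[centers.length];
            for (double p : points) {
                // Assign each point to the nearest of the two centers.
                int best = Math.abs(p - centers[0]) <= Math.abs(p - centers[1]) ? 0 : 1;
                sums[best] += p;
                counts[best]++;
            }
            converged = true;
            for (int i = 0; i < centers.length; i++) {
                // Mass center of the assigned points, as in the distributed version above.
                double massCenter = sums[i] / counts[i];
                if (Math.abs(massCenter - centers[i]) > eps)
                    converged = false;
                centers[i] = massCenter;
            }
        }
        System.out.printf("centers: %.2f, %.2f%n", centers[0], centers[1]); // centers: 1.10, 5.10
    }
}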
Use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.
The class ColumnDecisionTreeTrainerBenchmark, method tstMNISTSparseDistributedMatrix.
/**
 * Run decision tree classifier on MNIST using a sparse distributed matrix as storage for the dataset.
 * To run this test, rename this method so that it starts with 'test'.
 *
 * @throws IOException In case of errors while loading the MNIST dataset.
 */
public void tstMNISTSparseDistributedMatrix() throws IOException {
    IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName());
    int ptsCnt = 30_000;
    int featCnt = 28 * 28;
    Properties props = loadMNISTProperties();
    Stream<DenseLocalOnHeapVector> trainingMnistStream = MnistUtils.mnist(props.getProperty(PROP_TRAINING_IMAGES), props.getProperty(PROP_TRAINING_LABELS), new Random(123L), ptsCnt);
    Stream<DenseLocalOnHeapVector> testMnistStream = MnistUtils.mnist(props.getProperty(PROP_TEST_IMAGES), props.getProperty(PROP_TEST_LABELS), new Random(123L), 10_000);
    SparseDistributedMatrix m = new SparseDistributedMatrix(ptsCnt, featCnt + 1, StorageConstants.COLUMN_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE);
    SparseDistributedMatrixStorage sto = (SparseDistributedMatrixStorage) m.getStorage();
    loadVectorsIntoSparseDistributedMatrixCache(sto.cache().getName(), sto.getUUID(), trainingMnistStream.iterator(), featCnt + 1);
    ColumnDecisionTreeTrainer<GiniSplitCalculator.GiniData> trainer = new ColumnDecisionTreeTrainer<>(10, ContinuousSplitCalculators.GINI.apply(ignite), RegionCalculators.GINI, RegionCalculators.MOST_COMMON, ignite);
    X.println("Training started");
    long before = System.currentTimeMillis();
    DecisionTreeModel mdl = trainer.train(new MatrixColumnDecisionTreeTrainerInput(m, new HashMap<>()));
    X.println("Training finished in " + (System.currentTimeMillis() - before));
    IgniteTriFunction<Model<Vector, Double>, Stream<IgniteBiTuple<Vector, Double>>, Function<Double, Double>, Double> mse = Estimators.errorsPercentage();
    Double accuracy = mse.apply(mdl, testMnistStream.map(v -> new IgniteBiTuple<>(v.viewPart(0, featCnt), v.getX(featCnt))), Function.identity());
    X.println("Errors percentage: " + accuracy);
    Assert.assertEquals(0, SplitCache.getOrCreate(ignite).size());
    Assert.assertEquals(0, FeaturesCache.getOrCreate(ignite).size());
    Assert.assertEquals(0, ContextCache.getOrCreate(ignite).size());
    Assert.assertEquals(0, ProjectionsCache.getOrCreate(ignite).size());
}
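loadVectorsIntoSparseDistributedMatrixCache is a private benchmark helper that writes vectors straight into the matrix's backing cache and is not shown on this page. The sketch below reaches the same end state through the public Matrix API instead, which is slower but avoids assumptions about the cache's value layout. It assumes Matrix.set(row, col, val) and Vector.getX(idx) from the ignite-ml math API and takes the import paths of SparseDistributedMatrix and DenseLocalOnHeapVector as given.

import java.util.Iterator;
import org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix;
import org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector;

public class MatrixLoadSketch {
    /** Loads feature-plus-label vectors into the matrix row by row via the public API. */
    static void load(SparseDistributedMatrix m, Iterator<DenseLocalOnHeapVector> vectors, int featCnt) {
        int row = 0;
        while (vectors.hasNext()) {
            DenseLocalOnHeapVector v = vectors.next();
            // Columns 0..featCnt-1 hold the features, column featCnt holds the label.
            for (int col = 0; col <= featCnt; col++)
                m.set(row, col, v.getX(col));
            row++;
        }
    }
}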
Use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.
The class ColumnDecisionTreeTrainerBenchmark, method testByGenStreamerLoad.
/**
 * Trains a column decision tree on data produced by the given SplitDataGenerator, loading it through
 * the matrix cache, and checks the model's predictions against the per-region labels.
 */
private void testByGenStreamerLoad(int ptsPerReg, HashMap<Integer, Integer> catsInfo, SplitDataGenerator<DenseLocalOnHeapVector> gen, Random rnd) {
    List<IgniteBiTuple<Integer, DenseLocalOnHeapVector>> lst = gen.points(ptsPerReg, (i, rn) -> i).collect(Collectors.toList());
    int featCnt = gen.featuresCnt();
    Collections.shuffle(lst, rnd);
    int numRegs = gen.regsCount();
    SparseDistributedMatrix m = new SparseDistributedMatrix(numRegs * ptsPerReg, featCnt + 1, StorageConstants.COLUMN_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE);
    IgniteFunction<DoubleStream, Double> regCalc = s -> s.average().orElse(0.0);
    Map<Integer, List<LabeledVectorDouble>> byRegion = new HashMap<>();
    SparseDistributedMatrixStorage sto = (SparseDistributedMatrixStorage) m.getStorage();
    long before = System.currentTimeMillis();
    X.println("Batch loading started...");
    loadVectorsIntoSparseDistributedMatrixCache(sto.cache().getName(), sto.getUUID(), gen.points(ptsPerReg, (i, rn) -> i).map(IgniteBiTuple::get2).iterator(), featCnt + 1);
    X.println("Batch loading took " + (System.currentTimeMillis() - before) + " ms.");
    for (IgniteBiTuple<Integer, DenseLocalOnHeapVector> bt : lst) {
        byRegion.putIfAbsent(bt.get1(), new LinkedList<>());
        byRegion.get(bt.get1()).add(asLabeledVector(bt.get2().getStorage().data()));
    }
    ColumnDecisionTreeTrainer<VarianceSplitCalculator.VarianceData> trainer = new ColumnDecisionTreeTrainer<>(2, ContinuousSplitCalculators.VARIANCE, RegionCalculators.VARIANCE, regCalc, ignite);
    before = System.currentTimeMillis();
    DecisionTreeModel mdl = trainer.train(new MatrixColumnDecisionTreeTrainerInput(m, catsInfo));
    X.println("Training took: " + (System.currentTimeMillis() - before) + " ms.");
    byRegion.keySet().forEach(k -> {
        LabeledVectorDouble sp = byRegion.get(k).get(0);
        Tracer.showAscii(sp.features());
        X.println("Predicted value and label [pred=" + mdl.apply(sp.features()) + ", label=" + sp.doubleLabel() + "]");
        assert mdl.apply(sp.features()) == sp.doubleLabel();
    });
}
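The final assertion holds because the generator labels every point with its region index ((i, rn) -> i) and regCalc turns a leaf's region into the mean of its labels, so a tree that separates the regions reproduces the labels exactly. A tiny plain-Java illustration of that leaf value calculation:

import java.util.function.Function;
import java.util.stream.DoubleStream;

public class RegionCalcSketch {
    public static void main(String[] args) {
        // Same shape as the regCalc above: the leaf value is the mean of the labels in its region.
        Function<DoubleStream, Double> regCalc = s -> s.average().orElse(0.0);
        // All points generated for region 3 carry label 3.0, so the leaf value is exactly 3.0.
        System.out.println(regCalc.apply(DoubleStream.of(3.0, 3.0, 3.0))); // 3.0
    }
}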