Search in sources :

Example 1 with SparseDistributedMatrixStorage

use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.

the class SparseDistributedMatrixTest method buildKeySet.

/** Build key set for SparseDistributedMatrix. */
private Set<IgniteBiTuple<Integer, IgniteUuid>> buildKeySet(SparseDistributedMatrix m) {
    Set<IgniteBiTuple<Integer, IgniteUuid>> set = new HashSet<>();
    SparseDistributedMatrixStorage storage = (SparseDistributedMatrixStorage) m.getStorage();
    IgniteUuid uuid = storage.getUUID();
    int size = storage.storageMode() == StorageConstants.ROW_STORAGE_MODE ? storage.rowSize() : storage.columnSize();
    for (int i = 0; i < size; i++) set.add(new IgniteBiTuple<>(i, uuid));
    return set;
}
Also used : IgniteBiTuple(org.apache.ignite.lang.IgniteBiTuple) IgniteUuid(org.apache.ignite.lang.IgniteUuid) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) HashSet(java.util.HashSet)

Example 2 with SparseDistributedMatrixStorage

use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.

the class FuzzyCMeansDistributedClusterer method initializeCenters.

/**
 * Choose k primary centers from source points.
 *
 * @param points Matrix with source points.
 * @param k Number of centers.
 * @return Array of primary centers.
 */
private Vector[] initializeCenters(SparseDistributedMatrix points, int k) {
    int pointsNum = points.rowSize();
    Vector firstCenter = points.viewRow(rnd.nextInt(pointsNum));
    List<Vector> centers = new ArrayList<>();
    List<Vector> newCenters = new ArrayList<>();
    centers.add(firstCenter);
    newCenters.add(firstCenter);
    ConcurrentHashMap<Integer, Double> costs = new ConcurrentHashMap<>();
    int step = 0;
    UUID uuid = points.getUUID();
    String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName();
    while (step < initSteps) {
        ConcurrentHashMap<Integer, Double> newCosts = getNewCosts(cacheName, uuid, newCenters);
        for (Integer key : newCosts.keySet()) costs.merge(key, newCosts.get(key), Math::min);
        double costsSum = costs.values().stream().mapToDouble(Double::valueOf).sum();
        newCenters = getNewCenters(cacheName, uuid, costs, costsSum, k);
        centers.addAll(newCenters);
        step++;
    }
    return chooseKCenters(cacheName, uuid, centers, k);
}
Also used : ArrayList(java.util.ArrayList) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) UUID(java.util.UUID) Vector(org.apache.ignite.ml.math.Vector) DenseLocalOnHeapVector(org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector)

Example 3 with SparseDistributedMatrixStorage

use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.

the class KMeansDistributedClusterer method cluster.

/**
 */
@Override
public KMeansModel cluster(SparseDistributedMatrix points, int k) throws MathIllegalArgumentException, ConvergenceException {
    SparseDistributedMatrix pointsCp = (SparseDistributedMatrix) points.like(points.rowSize(), points.columnSize());
    String cacheName = ((SparseDistributedMatrixStorage) points.getStorage()).cacheName();
    // TODO: IGNITE-5825, this copy is very ineffective, just for POC. Immutability of data should be guaranteed by other methods
    // such as logical locks for example.
    pointsCp.assign(points);
    Vector[] centers = initClusterCenters(pointsCp, k);
    boolean converged = false;
    int iteration = 0;
    int dim = pointsCp.viewRow(0).size();
    UUID uid = pointsCp.getUUID();
    // Execute iterations of Lloyd's algorithm until converged
    while (iteration < maxIterations && !converged) {
        SumsAndCounts stats = getSumsAndCounts(centers, dim, uid, cacheName);
        converged = true;
        for (Integer ind : stats.sums.keySet()) {
            Vector massCenter = stats.sums.get(ind).times(1.0 / stats.counts.get(ind));
            if (converged && distance(massCenter, centers[ind]) > epsilon * epsilon)
                converged = false;
            centers[ind] = massCenter;
        }
        iteration++;
    }
    pointsCp.destroy();
    return new KMeansModel(centers, getDistanceMeasure());
}
Also used : SparseDistributedMatrix(org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) UUID(java.util.UUID) Vector(org.apache.ignite.ml.math.Vector)

Example 4 with SparseDistributedMatrixStorage

use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.

the class ColumnDecisionTreeTrainerBenchmark method tstMNISTSparseDistributedMatrix.

/**
 * Run decision tree classifier on MNIST using sparse distributed matrix as a storage for dataset.
 * To run this test rename this method so it starts from 'test'.
 *
 * @throws IOException In case of loading MNIST dataset errors.
 */
public void tstMNISTSparseDistributedMatrix() throws IOException {
    IgniteUtils.setCurrentIgniteName(ignite.configuration().getIgniteInstanceName());
    int ptsCnt = 30_000;
    int featCnt = 28 * 28;
    Properties props = loadMNISTProperties();
    Stream<DenseLocalOnHeapVector> trainingMnistStream = MnistUtils.mnist(props.getProperty(PROP_TRAINING_IMAGES), props.getProperty(PROP_TRAINING_LABELS), new Random(123L), ptsCnt);
    Stream<DenseLocalOnHeapVector> testMnistStream = MnistUtils.mnist(props.getProperty(PROP_TEST_IMAGES), props.getProperty(PROP_TEST_LABELS), new Random(123L), 10_000);
    SparseDistributedMatrix m = new SparseDistributedMatrix(ptsCnt, featCnt + 1, StorageConstants.COLUMN_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE);
    SparseDistributedMatrixStorage sto = (SparseDistributedMatrixStorage) m.getStorage();
    loadVectorsIntoSparseDistributedMatrixCache(sto.cache().getName(), sto.getUUID(), trainingMnistStream.iterator(), featCnt + 1);
    ColumnDecisionTreeTrainer<GiniSplitCalculator.GiniData> trainer = new ColumnDecisionTreeTrainer<>(10, ContinuousSplitCalculators.GINI.apply(ignite), RegionCalculators.GINI, RegionCalculators.MOST_COMMON, ignite);
    X.println("Training started");
    long before = System.currentTimeMillis();
    DecisionTreeModel mdl = trainer.train(new MatrixColumnDecisionTreeTrainerInput(m, new HashMap<>()));
    X.println("Training finished in " + (System.currentTimeMillis() - before));
    IgniteTriFunction<Model<Vector, Double>, Stream<IgniteBiTuple<Vector, Double>>, Function<Double, Double>, Double> mse = Estimators.errorsPercentage();
    Double accuracy = mse.apply(mdl, testMnistStream.map(v -> new IgniteBiTuple<>(v.viewPart(0, featCnt), v.getX(featCnt))), Function.identity());
    X.println("Errors percentage: " + accuracy);
    Assert.assertEquals(0, SplitCache.getOrCreate(ignite).size());
    Assert.assertEquals(0, FeaturesCache.getOrCreate(ignite).size());
    Assert.assertEquals(0, ContextCache.getOrCreate(ignite).size());
    Assert.assertEquals(0, ProjectionsCache.getOrCreate(ignite).size());
}
Also used : CacheAtomicityMode(org.apache.ignite.cache.CacheAtomicityMode) Arrays(java.util.Arrays) FeaturesCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.FeaturesCache) IgniteTestResources(org.apache.ignite.testframework.junits.IgniteTestResources) Random(java.util.Random) BiIndex(org.apache.ignite.ml.trees.trainers.columnbased.BiIndex) SparseDistributedMatrix(org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) VarianceSplitCalculator(org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.VarianceSplitCalculator) Vector(org.apache.ignite.ml.math.Vector) Estimators(org.apache.ignite.ml.estimators.Estimators) Map(java.util.Map) X(org.apache.ignite.internal.util.typedef.X) Level(org.apache.log4j.Level) DenseLocalOnHeapVector(org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector) MatrixColumnDecisionTreeTrainerInput(org.apache.ignite.ml.trees.trainers.columnbased.MatrixColumnDecisionTreeTrainerInput) LabeledVectorDouble(org.apache.ignite.ml.structures.LabeledVectorDouble) BaseDecisionTreeTest(org.apache.ignite.ml.trees.BaseDecisionTreeTest) IgniteTriFunction(org.apache.ignite.ml.math.functions.IgniteTriFunction) ProjectionsCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.ProjectionsCache) UUID(java.util.UUID) StreamTransformer(org.apache.ignite.stream.StreamTransformer) Collectors(java.util.stream.Collectors) IgniteCache(org.apache.ignite.IgniteCache) ContextCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.ContextCache) DoubleStream(java.util.stream.DoubleStream) IgniteBiTuple(org.apache.ignite.lang.IgniteBiTuple) List(java.util.List) IgniteConfiguration(org.apache.ignite.configuration.IgniteConfiguration) Stream(java.util.stream.Stream) SparseMatrixKey(org.apache.ignite.ml.math.distributed.keys.impl.SparseMatrixKey) SplitCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.SplitCache) RegionCalculators(org.apache.ignite.ml.trees.trainers.columnbased.regcalcs.RegionCalculators) IntStream(java.util.stream.IntStream) DecisionTreeModel(org.apache.ignite.ml.trees.models.DecisionTreeModel) IgniteFunction(org.apache.ignite.ml.math.functions.IgniteFunction) Model(org.apache.ignite.ml.Model) HashMap(java.util.HashMap) Function(java.util.function.Function) GiniSplitCalculator(org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.GiniSplitCalculator) BiIndexedCacheColumnDecisionTreeTrainerInput(org.apache.ignite.ml.trees.trainers.columnbased.BiIndexedCacheColumnDecisionTreeTrainerInput) CacheWriteSynchronizationMode(org.apache.ignite.cache.CacheWriteSynchronizationMode) IgniteUtils(org.apache.ignite.internal.util.IgniteUtils) MnistUtils(org.apache.ignite.ml.util.MnistUtils) LinkedList(java.util.LinkedList) Properties(java.util.Properties) Iterator(java.util.Iterator) ContinuousSplitCalculators(org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.ContinuousSplitCalculators) IOException(java.io.IOException) SplitDataGenerator(org.apache.ignite.ml.trees.SplitDataGenerator) Int2DoubleOpenHashMap(it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap) Ignition(org.apache.ignite.Ignition) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) IgniteDataStreamer(org.apache.ignite.IgniteDataStreamer) Tracer(org.apache.ignite.ml.math.Tracer) StorageConstants(org.apache.ignite.ml.math.StorageConstants) Assert(org.junit.Assert) Collections(java.util.Collections) ColumnDecisionTreeTrainer(org.apache.ignite.ml.trees.trainers.columnbased.ColumnDecisionTreeTrainer) GridCacheProcessor(org.apache.ignite.internal.processors.cache.GridCacheProcessor) InputStream(java.io.InputStream) CacheMode(org.apache.ignite.cache.CacheMode) SparseDistributedMatrix(org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix) MatrixColumnDecisionTreeTrainerInput(org.apache.ignite.ml.trees.trainers.columnbased.MatrixColumnDecisionTreeTrainerInput) HashMap(java.util.HashMap) Int2DoubleOpenHashMap(it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap) IgniteBiTuple(org.apache.ignite.lang.IgniteBiTuple) DecisionTreeModel(org.apache.ignite.ml.trees.models.DecisionTreeModel) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) Properties(java.util.Properties) LabeledVectorDouble(org.apache.ignite.ml.structures.LabeledVectorDouble) IgniteTriFunction(org.apache.ignite.ml.math.functions.IgniteTriFunction) IgniteFunction(org.apache.ignite.ml.math.functions.IgniteFunction) Function(java.util.function.Function) Random(java.util.Random) DecisionTreeModel(org.apache.ignite.ml.trees.models.DecisionTreeModel) Model(org.apache.ignite.ml.Model) DoubleStream(java.util.stream.DoubleStream) Stream(java.util.stream.Stream) IntStream(java.util.stream.IntStream) InputStream(java.io.InputStream) DenseLocalOnHeapVector(org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector) Vector(org.apache.ignite.ml.math.Vector) DenseLocalOnHeapVector(org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector) ColumnDecisionTreeTrainer(org.apache.ignite.ml.trees.trainers.columnbased.ColumnDecisionTreeTrainer)

Example 5 with SparseDistributedMatrixStorage

use of org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage in project ignite by apache.

the class ColumnDecisionTreeTrainerBenchmark method testByGenStreamerLoad.

/**
 */
private void testByGenStreamerLoad(int ptsPerReg, HashMap<Integer, Integer> catsInfo, SplitDataGenerator<DenseLocalOnHeapVector> gen, Random rnd) {
    List<IgniteBiTuple<Integer, DenseLocalOnHeapVector>> lst = gen.points(ptsPerReg, (i, rn) -> i).collect(Collectors.toList());
    int featCnt = gen.featuresCnt();
    Collections.shuffle(lst, rnd);
    int numRegs = gen.regsCount();
    SparseDistributedMatrix m = new SparseDistributedMatrix(numRegs * ptsPerReg, featCnt + 1, StorageConstants.COLUMN_STORAGE_MODE, StorageConstants.RANDOM_ACCESS_MODE);
    IgniteFunction<DoubleStream, Double> regCalc = s -> s.average().orElse(0.0);
    Map<Integer, List<LabeledVectorDouble>> byRegion = new HashMap<>();
    SparseDistributedMatrixStorage sto = (SparseDistributedMatrixStorage) m.getStorage();
    long before = System.currentTimeMillis();
    X.println("Batch loading started...");
    loadVectorsIntoSparseDistributedMatrixCache(sto.cache().getName(), sto.getUUID(), gen.points(ptsPerReg, (i, rn) -> i).map(IgniteBiTuple::get2).iterator(), featCnt + 1);
    X.println("Batch loading took " + (System.currentTimeMillis() - before) + " ms.");
    for (IgniteBiTuple<Integer, DenseLocalOnHeapVector> bt : lst) {
        byRegion.putIfAbsent(bt.get1(), new LinkedList<>());
        byRegion.get(bt.get1()).add(asLabeledVector(bt.get2().getStorage().data()));
    }
    ColumnDecisionTreeTrainer<VarianceSplitCalculator.VarianceData> trainer = new ColumnDecisionTreeTrainer<>(2, ContinuousSplitCalculators.VARIANCE, RegionCalculators.VARIANCE, regCalc, ignite);
    before = System.currentTimeMillis();
    DecisionTreeModel mdl = trainer.train(new MatrixColumnDecisionTreeTrainerInput(m, catsInfo));
    X.println("Training took: " + (System.currentTimeMillis() - before) + " ms.");
    byRegion.keySet().forEach(k -> {
        LabeledVectorDouble sp = byRegion.get(k).get(0);
        Tracer.showAscii(sp.features());
        X.println("Predicted value and label [pred=" + mdl.apply(sp.features()) + ", label=" + sp.doubleLabel() + "]");
        assert mdl.apply(sp.features()) == sp.doubleLabel();
    });
}
Also used : CacheAtomicityMode(org.apache.ignite.cache.CacheAtomicityMode) Arrays(java.util.Arrays) FeaturesCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.FeaturesCache) IgniteTestResources(org.apache.ignite.testframework.junits.IgniteTestResources) Random(java.util.Random) BiIndex(org.apache.ignite.ml.trees.trainers.columnbased.BiIndex) SparseDistributedMatrix(org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) VarianceSplitCalculator(org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.VarianceSplitCalculator) Vector(org.apache.ignite.ml.math.Vector) Estimators(org.apache.ignite.ml.estimators.Estimators) Map(java.util.Map) X(org.apache.ignite.internal.util.typedef.X) Level(org.apache.log4j.Level) DenseLocalOnHeapVector(org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector) MatrixColumnDecisionTreeTrainerInput(org.apache.ignite.ml.trees.trainers.columnbased.MatrixColumnDecisionTreeTrainerInput) LabeledVectorDouble(org.apache.ignite.ml.structures.LabeledVectorDouble) BaseDecisionTreeTest(org.apache.ignite.ml.trees.BaseDecisionTreeTest) IgniteTriFunction(org.apache.ignite.ml.math.functions.IgniteTriFunction) ProjectionsCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.ProjectionsCache) UUID(java.util.UUID) StreamTransformer(org.apache.ignite.stream.StreamTransformer) Collectors(java.util.stream.Collectors) IgniteCache(org.apache.ignite.IgniteCache) ContextCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.ContextCache) DoubleStream(java.util.stream.DoubleStream) IgniteBiTuple(org.apache.ignite.lang.IgniteBiTuple) List(java.util.List) IgniteConfiguration(org.apache.ignite.configuration.IgniteConfiguration) Stream(java.util.stream.Stream) SparseMatrixKey(org.apache.ignite.ml.math.distributed.keys.impl.SparseMatrixKey) SplitCache(org.apache.ignite.ml.trees.trainers.columnbased.caches.SplitCache) RegionCalculators(org.apache.ignite.ml.trees.trainers.columnbased.regcalcs.RegionCalculators) IntStream(java.util.stream.IntStream) DecisionTreeModel(org.apache.ignite.ml.trees.models.DecisionTreeModel) IgniteFunction(org.apache.ignite.ml.math.functions.IgniteFunction) Model(org.apache.ignite.ml.Model) HashMap(java.util.HashMap) Function(java.util.function.Function) GiniSplitCalculator(org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.GiniSplitCalculator) BiIndexedCacheColumnDecisionTreeTrainerInput(org.apache.ignite.ml.trees.trainers.columnbased.BiIndexedCacheColumnDecisionTreeTrainerInput) CacheWriteSynchronizationMode(org.apache.ignite.cache.CacheWriteSynchronizationMode) IgniteUtils(org.apache.ignite.internal.util.IgniteUtils) MnistUtils(org.apache.ignite.ml.util.MnistUtils) LinkedList(java.util.LinkedList) Properties(java.util.Properties) Iterator(java.util.Iterator) ContinuousSplitCalculators(org.apache.ignite.ml.trees.trainers.columnbased.contsplitcalcs.ContinuousSplitCalculators) IOException(java.io.IOException) SplitDataGenerator(org.apache.ignite.ml.trees.SplitDataGenerator) Int2DoubleOpenHashMap(it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap) Ignition(org.apache.ignite.Ignition) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) IgniteDataStreamer(org.apache.ignite.IgniteDataStreamer) Tracer(org.apache.ignite.ml.math.Tracer) StorageConstants(org.apache.ignite.ml.math.StorageConstants) Assert(org.junit.Assert) Collections(java.util.Collections) ColumnDecisionTreeTrainer(org.apache.ignite.ml.trees.trainers.columnbased.ColumnDecisionTreeTrainer) GridCacheProcessor(org.apache.ignite.internal.processors.cache.GridCacheProcessor) InputStream(java.io.InputStream) CacheMode(org.apache.ignite.cache.CacheMode) SparseDistributedMatrix(org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix) LabeledVectorDouble(org.apache.ignite.ml.structures.LabeledVectorDouble) IgniteBiTuple(org.apache.ignite.lang.IgniteBiTuple) HashMap(java.util.HashMap) Int2DoubleOpenHashMap(it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap) MatrixColumnDecisionTreeTrainerInput(org.apache.ignite.ml.trees.trainers.columnbased.MatrixColumnDecisionTreeTrainerInput) DecisionTreeModel(org.apache.ignite.ml.trees.models.DecisionTreeModel) SparseDistributedMatrixStorage(org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage) LabeledVectorDouble(org.apache.ignite.ml.structures.LabeledVectorDouble) DoubleStream(java.util.stream.DoubleStream) List(java.util.List) LinkedList(java.util.LinkedList) DenseLocalOnHeapVector(org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector) ColumnDecisionTreeTrainer(org.apache.ignite.ml.trees.trainers.columnbased.ColumnDecisionTreeTrainer)

Aggregations

SparseDistributedMatrixStorage (org.apache.ignite.ml.math.impls.storage.matrix.SparseDistributedMatrixStorage)11 Vector (org.apache.ignite.ml.math.Vector)9 UUID (java.util.UUID)8 SparseDistributedMatrix (org.apache.ignite.ml.math.impls.matrix.SparseDistributedMatrix)7 Map (java.util.Map)6 DenseLocalOnHeapVector (org.apache.ignite.ml.math.impls.vector.DenseLocalOnHeapVector)6 List (java.util.List)5 Random (java.util.Random)5 Collectors (java.util.stream.Collectors)5 SparseMatrixKey (org.apache.ignite.ml.math.distributed.keys.impl.SparseMatrixKey)5 DenseLocalOnHeapMatrix (org.apache.ignite.ml.math.impls.matrix.DenseLocalOnHeapMatrix)4 Int2DoubleOpenHashMap (it.unimi.dsi.fastutil.ints.Int2DoubleOpenHashMap)3 IOException (java.io.IOException)3 InputStream (java.io.InputStream)3 ArrayList (java.util.ArrayList)3 Arrays (java.util.Arrays)3 Collections (java.util.Collections)3 HashMap (java.util.HashMap)3 Iterator (java.util.Iterator)3 LinkedList (java.util.LinkedList)3