Search in sources :

Example 6 with KMeansModel

Use of org.apache.ignite.ml.clustering.kmeans.KMeansModel in the Apache Ignite project.

Class CustomersClusterizationExample, method main.

/**
 * Entry point of the customers clusterization example.
 * <p>
 * Starts an Ignite node, loads the wholesale customers dataset into a cache and, for every
 * cluster count from 1 to 9, trains a k-means model on the train part of an 80/20 split and
 * prints the mean entropy of the clusters computed on the test part. The cache is destroyed
 * when the work is finished or has failed.
 *
 * @param args Command line arguments (not used).
 * @throws IOException Declared by the signature; presumably raised while the dataset is read.
 */
public static void main(String[] args) throws IOException {
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        System.out.println(">>> Ignite grid started.");

        IgniteCache<Integer, Vector> customers = null;
        try {
            System.out.println(">>> Fill dataset cache.");
            SandboxMLCache sandbox = new SandboxMLCache(ignite);
            customers = sandbox.fillCacheWith(MLSandboxDatasets.WHOLESALE_CUSTOMERS);

            System.out.println(">>> Start training and scoring.");
            for (int k = 1; k <= 9; k++) {
                // Fixed RNG seed: the learning environment is seeded with 0 for every cluster count.
                LearningEnvironmentBuilder envBuilder = LearningEnvironmentBuilder.defaultBuilder().withRNGSeed(0);

                KMeansTrainer kMeans = new KMeansTrainer()
                    .withAmountOfClusters(k)
                    .withDistance(new EuclideanDistance())
                    .withEnvironmentBuilder(envBuilder)
                    .withMaxIterations(50);

                // Cache values are already vectors; the label is taken from the first coordinate.
                Vectorizer<Integer, Vector, Integer, Double> labelFirst =
                    new DummyVectorizer<Integer>().labeled(Vectorizer.LabelCoordinate.FIRST);

                // 80% of the data is used for training, the rest for scoring.
                TrainTestSplit<Integer, Vector> trainTest = new TrainTestDatasetSplitter<Integer, Vector>().split(0.8);

                KMeansModel model = kMeans.fit(ignite, customers, trainTest.getTrainFilter(), labelFirst);
                double meanEntropy = computeMeanEntropy(customers, trainTest.getTestFilter(), labelFirst, model);

                String report = String.format(">> Clusters mean entropy [%d clusters]: %.2f", k, meanEntropy);
                System.out.println(report);
            }
        } finally {
            // The cache reference stays null when filling it failed, so there is nothing to destroy then.
            if (customers != null)
                customers.destroy();
        }
    } finally {
        System.out.flush();
    }
}
Also used : SandboxMLCache(org.apache.ignite.examples.ml.util.SandboxMLCache) KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) EuclideanDistance(org.apache.ignite.ml.math.distances.EuclideanDistance) KMeansTrainer(org.apache.ignite.ml.clustering.kmeans.KMeansTrainer) Ignite(org.apache.ignite.Ignite) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector)

Example 7 with KMeansModel

Use of org.apache.ignite.ml.clustering.kmeans.KMeansModel in the Apache Ignite project.

Class CustomersClusterizationExample, method computeMeanEntropy.

/**
 * Computes mean entropy in clusters.
 * <p>
 * Every entry selected by {@code filter} is vectorized, assigned to a cluster by the model, and
 * counted per (cluster, label) pair. For each cluster that received at least one entry the
 * entropy of its label distribution is computed as {@code -sum(p * ln(p))}; the result is the
 * arithmetic mean of these per-cluster entropies.
 * <p>
 * If the filter selects no entries, no cluster is recorded and the result is {@code NaN}
 * (zero divided by zero in floating point).
 *
 * @param cache      Dataset cache.
 * @param filter     Test dataset filter.
 * @param vectorizer Upstream vectorizer.
 * @param mdl        KMeans model.
 * @return Score.
 */
private static double computeMeanEntropy(IgniteCache<Integer, Vector> cache, IgniteBiPredicate<Integer, Vector> filter, Vectorizer<Integer, Vector, Integer, Double> vectorizer, KMeansModel mdl) {
    // Cluster id -> (label -> number of entries with that label in the cluster).
    Map<Integer, Map<Integer, Integer>> clusterUniqueLbCounts = new HashMap<>();
    try (QueryCursor<Cache.Entry<Integer, Vector>> cursor = cache.query(new ScanQuery<>(filter))) {
        for (Cache.Entry<Integer, Vector> ent : cursor) {
            LabeledVector<Double> vec = vectorizer.apply(ent.getKey(), ent.getValue());
            int cluster = mdl.predict(vec.features());
            // Labels are stored as doubles; they are truncated to int to serve as class ids.
            int ch = vec.label().intValue();
            // One lookup per level instead of containsKey/put/get; the scan is single-threaded,
            // so a plain boxed counter is sufficient.
            clusterUniqueLbCounts.computeIfAbsent(cluster, c -> new HashMap<>()).merge(ch, 1, Integer::sum);
        }
    }
    double sumOfClusterEntropies = 0.0;
    for (Map<Integer, Integer> lbCounters : clusterUniqueLbCounts.values()) {
        int sizeOfCluster = lbCounters.values().stream().mapToInt(Integer::intValue).sum();
        // Counts are converted to double first so that the division below is not an integer division.
        double entropyInCluster = lbCounters.values().stream()
            .mapToDouble(Integer::doubleValue)
            .map(lblsCount -> lblsCount / sizeOfCluster)
            .map(lblProb -> -lblProb * Math.log(lblProb))
            .sum();
        sumOfClusterEntropies += entropyInCluster;
    }
    return sumOfClusterEntropies / clusterUniqueLbCounts.size();
}
Also used : IgniteBiPredicate(org.apache.ignite.lang.IgniteBiPredicate) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) IOException(java.io.IOException) HashMap(java.util.HashMap) Ignite(org.apache.ignite.Ignite) IgniteCache(org.apache.ignite.IgniteCache) DummyVectorizer(org.apache.ignite.ml.dataset.feature.extractor.impl.DummyVectorizer) KMeansTrainer(org.apache.ignite.ml.clustering.kmeans.KMeansTrainer) Ignition(org.apache.ignite.Ignition) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) MLSandboxDatasets(org.apache.ignite.examples.ml.util.MLSandboxDatasets) TrainTestDatasetSplitter(org.apache.ignite.ml.selection.split.TrainTestDatasetSplitter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SandboxMLCache(org.apache.ignite.examples.ml.util.SandboxMLCache) KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) Map(java.util.Map) QueryCursor(org.apache.ignite.cache.query.QueryCursor) EuclideanDistance(org.apache.ignite.ml.math.distances.EuclideanDistance) TrainTestSplit(org.apache.ignite.ml.selection.split.TrainTestSplit) Cache(javax.cache.Cache) ScanQuery(org.apache.ignite.cache.query.ScanQuery) LearningEnvironmentBuilder(org.apache.ignite.ml.environment.LearningEnvironmentBuilder) Vectorizer(org.apache.ignite.ml.dataset.feature.extractor.Vectorizer) HashMap(java.util.HashMap) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HashMap(java.util.HashMap) Map(java.util.Map) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) IgniteCache(org.apache.ignite.IgniteCache) SandboxMLCache(org.apache.ignite.examples.ml.util.SandboxMLCache) Cache(javax.cache.Cache)

Example 8 with KMeansModel

Use of org.apache.ignite.ml.clustering.kmeans.KMeansModel in the Apache Ignite project.

Class KMeansClusterizationExample, method main.

/**
 * Entry point of the k-means clusterization example.
 * <p>
 * Starts an Ignite node, loads the two-classed Iris dataset into a cache, trains a k-means
 * model with the trainer's default settings, prints the first two centroids and then, for
 * every cached observation, the predicted cluster next to the label stored in the first
 * coordinate. The cache is destroyed when the work is finished or has failed.
 *
 * @param args Command line arguments (not used).
 * @throws IOException Declared by the signature; presumably raised while the dataset is read.
 */
public static void main(String[] args) throws IOException {
    System.out.println();
    System.out.println(">>> KMeans clustering algorithm over cached dataset usage example started.");

    // Bring up the grid; it is closed automatically at the end of the block.
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        System.out.println(">>> Ignite grid started.");

        IgniteCache<Integer, Vector> irisCache = null;
        try {
            SandboxMLCache sandbox = new SandboxMLCache(ignite);
            irisCache = sandbox.fillCacheWith(MLSandboxDatasets.TWO_CLASSED_IRIS);

            // The label is taken from the first coordinate of every cached vector.
            Vectorizer<Integer, Vector, Integer, Double> labelFirst =
                new DummyVectorizer<Integer>().labeled(Vectorizer.LabelCoordinate.FIRST);

            KMeansTrainer kMeans = new KMeansTrainer();
            KMeansModel model = kMeans.fit(ignite, irisCache, labelFirst);

            System.out.println(">>> KMeans centroids");
            Tracer.showAscii(model.centers()[0]);
            Tracer.showAscii(model.centers()[1]);
            System.out.println(">>>");

            System.out.println(">>> --------------------------------------------");
            System.out.println(">>> | Predicted cluster\t| Erased class label\t|");
            System.out.println(">>> --------------------------------------------");

            try (QueryCursor<Cache.Entry<Integer, Vector>> rows = irisCache.query(new ScanQuery<>())) {
                for (Cache.Entry<Integer, Vector> row : rows) {
                    Vector full = row.getValue();

                    // Coordinate 0 holds the label, the remaining coordinates are the features.
                    double groundTruth = full.get(0);
                    Vector features = full.copyOfRange(1, full.size());

                    // Kept as a double local: the format string below prints it with %.4f.
                    double prediction = model.predict(features);

                    System.out.printf(">>> | %.4f\t\t\t| %.4f\t\t|\n", prediction, groundTruth);
                }

                System.out.println(">>> ---------------------------------");
                System.out.println(">>> KMeans clustering algorithm over cached dataset usage example completed.");
            }
        } finally {
            // The cache reference stays null when filling it failed, so there is nothing to destroy then.
            if (irisCache != null)
                irisCache.destroy();
        }
    } finally {
        System.out.flush();
    }
}
Also used : SandboxMLCache(org.apache.ignite.examples.ml.util.SandboxMLCache) KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) KMeansTrainer(org.apache.ignite.ml.clustering.kmeans.KMeansTrainer) Ignite(org.apache.ignite.Ignite) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) IgniteCache(org.apache.ignite.IgniteCache) SandboxMLCache(org.apache.ignite.examples.ml.util.SandboxMLCache) Cache(javax.cache.Cache)

Example 9 with KMeansModel

Use of org.apache.ignite.ml.clustering.kmeans.KMeansModel in the Apache Ignite project.

Class KMeansFromSparkExample, method main.

/**
 * Run example.
 * <p>
 * Starts an Ignite node, loads the Titanic passengers into a cache, parses a k-means model
 * exported from Spark and prints, for every cached passenger, the cluster predicted by the
 * model next to the passenger's label. The cache is destroyed when the work is finished or
 * has failed.
 *
 * @param args Command line arguments (not used).
 * @throws FileNotFoundException Declared by the signature; presumably raised when the dataset
 *      or the Spark model file cannot be found.
 */
public static void main(String[] args) throws FileNotFoundException {
    System.out.println();
    System.out.println(">>> K-means model loaded from Spark through serialization over partitioned dataset usage example started.");
    // Start ignite grid.
    try (Ignite ignite = Ignition.start("examples/config/example-ignite.xml")) {
        System.out.println(">>> Ignite grid started.");
        IgniteCache<Integer, Vector> dataCache = null;
        try {
            dataCache = TitanicUtils.readPassengers(ignite);
            // Features are read from coordinates 0, 5, 6 and 4 of the cached vector, the label from coordinate 1.
            final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 5, 6, 4).labeled(1);
            KMeansModel mdl = (KMeansModel) SparkModelParser.parse(SPARK_MDL_PATH, SupportedSparkModels.KMEANS, env);
            System.out.println(">>> K-Means model: " + mdl);
            System.out.println(">>> ------------------------------------");
            System.out.println(">>> | Predicted cluster\t| Is survived\t|");
            System.out.println(">>> ------------------------------------");
            try (QueryCursor<Cache.Entry<Integer, Vector>> observations = dataCache.query(new ScanQuery<>())) {
                for (Cache.Entry<Integer, Vector> observation : observations) {
                    LabeledVector<Double> lv = vectorizer.apply(observation.getKey(), observation.getValue());
                    Vector inputs = lv.features();
                    double isSurvived = lv.label();
                    // Kept as a double local: the format string below prints it with %.4f.
                    double clusterId = mdl.predict(inputs);
                    System.out.printf(">>> | %.4f\t\t| %.4f\t\t|\n", clusterId, isSurvived);
                }
            }
            System.out.println(">>> ---------------------------------");
        } finally {
            // If reading the passengers failed, the cache was never assigned. An unguarded
            // destroy() would then throw NullPointerException and hide the original failure.
            if (dataCache != null)
                dataCache.destroy();
        }
    } finally {
        // Make sure everything printed so far is written out even when the example fails.
        System.out.flush();
    }
}
Also used : KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) Ignite(org.apache.ignite.Ignite) Vector(org.apache.ignite.ml.math.primitives.vector.Vector) LabeledVector(org.apache.ignite.ml.structures.LabeledVector) IgniteCache(org.apache.ignite.IgniteCache) Cache(javax.cache.Cache)

Example 10 with KMeansModel

Use of org.apache.ignite.ml.clustering.kmeans.KMeansModel in the Apache Ignite project.

Class LocalModelsTest, method importExportKMeansModelTest.

/**
 * Checks the export/import round trip of a k-means model: the model is saved through a file
 * exporter, its stored format is loaded back, and a model rebuilt from the loaded centers and
 * distance must be equal to the original one.
 *
 * @throws IOException Declared by the signature; presumably raised by the file-based test harness.
 */
@Test
public void importExportKMeansModelTest() throws IOException {
    executeModelTest(path -> {
        KMeansModel original = getClusterModel();

        // Write the model to the file supplied by the harness.
        Exporter<KMeansModelFormat, String> fileExporter = new FileExporter<>();
        original.saveModel(fileExporter, path);

        // Read the persisted representation back with the same exporter.
        KMeansModelFormat restoredFormat = fileExporter.load(path);
        Assert.assertNotNull(restoredFormat);

        KMeansModel restored = new KMeansModel(restoredFormat.getCenters(), restoredFormat.getDistance());
        Assert.assertEquals("", original, restored);

        return null;
    });
}
Also used : KMeansModel(org.apache.ignite.ml.clustering.kmeans.KMeansModel) FileExporter(org.apache.ignite.ml.FileExporter) KMeansModelFormat(org.apache.ignite.ml.clustering.kmeans.KMeansModelFormat) Test(org.junit.Test)

Aggregations

KMeansModel (org.apache.ignite.ml.clustering.kmeans.KMeansModel)14 Vector (org.apache.ignite.ml.math.primitives.vector.Vector)10 KMeansTrainer (org.apache.ignite.ml.clustering.kmeans.KMeansTrainer)9 Ignite (org.apache.ignite.Ignite)6 Test (org.junit.Test)6 EuclideanDistance (org.apache.ignite.ml.math.distances.EuclideanDistance)5 DenseVector (org.apache.ignite.ml.math.primitives.vector.impl.DenseVector)5 SandboxMLCache (org.apache.ignite.examples.ml.util.SandboxMLCache)4 LabeledVector (org.apache.ignite.ml.structures.LabeledVector)4 Cache (javax.cache.Cache)3 IgniteCache (org.apache.ignite.IgniteCache)3 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 BinaryObject (org.apache.ignite.binary.BinaryObject)2 KMeansModelFormat (org.apache.ignite.ml.clustering.kmeans.KMeansModelFormat)2 TrainerTest (org.apache.ignite.ml.common.TrainerTest)2 Path (java.nio.file.Path)1 Map (java.util.Map)1 Configuration (org.apache.hadoop.conf.Configuration)1