Search in sources:

Example 1 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class EvaluateVarianceRatioCriteria method evaluateClustering.

/**
 * Compute the variance ratio criterion of one clustering, report it to the
 * statistics log, and register it as a measurement for the clustering.
 *
 * The score is left at 0 when the clustering has fewer than two clusters.
 *
 * @param db Database whose result hierarchy receives the measurement
 * @param rel Data relation supplying the vectors
 * @param c Clustering to evaluate
 * @return Variance Ratio Criteria
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    // FIXME: allow using a precomputed distance matrix!
    final SquaredEuclideanDistanceFunction sqDist = SquaredEuclideanDistanceFunction.STATIC;
    final List<? extends Cluster<?>> allClusters = c.getAllClusters();
    double score = 0.;
    int skipped = 0;
    if (allClusters.size() > 1) {
        // Per-cluster centroids; the helper also reports how many noise
        // objects it ignored.
        final NumberVector[] means = new NumberVector[allClusters.size()];
        skipped = EvaluateSimplifiedSilhouette.centroids(rel, allClusters, means, noiseOption);
        // Global centroid, and the cluster count used in the ratio below.
        final Centroid grandMean = new Centroid(RelationUtil.dimensionality(rel));
        final int numClusters = globalCentroid(grandMean, rel, allClusters, means, noiseOption);
        // within: summed squared distance of each object to its own centroid.
        // total: summed squared distance of each object to the global centroid.
        double within = 0, total = 0;
        // Position of the current cluster in the centroid array; advanced at
        // the top of the loop so that skipped clusters still consume a slot.
        int idx = -1;
        for (Cluster<?> cluster : allClusters) {
            idx++;
            if (cluster.size() <= 1 || cluster.isNoise()) {
                switch(noiseOption) {
                    case IGNORE_NOISE:
                        // Contributes nothing to either sum.
                        continue;
                    case TREAT_NOISE_AS_SINGLETONS:
                        // A singleton is its own centroid, so only the
                        // global term grows.
                        for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                            total += sqDist.distance(grandMean, rel.get(it));
                        }
                        // Move on to the next cluster.
                        continue;
                    case MERGE_NOISE:
                        // Handled like a regular cluster below.
                        break;
                }
            }
            for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                final NumberVector point = rel.get(it);
                within += sqDist.distance(means[idx], point);
                total += sqDist.distance(grandMean, point);
            }
        }
        final double varianceRatio = (total - within) / within;
        final double freedomRatio = (rel.size() - numClusters) / (numClusters - 1.);
        score = varianceRatio * freedomRatio;
        // Scale down by the share of objects that were actually evaluated.
        // Only has an effect when noise objects were ignored.
        if (penalize && skipped > 0) {
            score = score * ((rel.size() - skipped) / (double) rel.size());
        }
    }
    if (LOG.isStatistics()) {
        LOG.statistics(new StringStatistic(key + ".vrc.noise-handling", noiseOption.toString()));
        if (skipped > 0) {
            LOG.statistics(new LongStatistic(key + ".vrc.ignored", skipped));
        }
        LOG.statistics(new DoubleStatistic(key + ".vrc", score));
    }
    // Attach the score to the evaluation result of this clustering.
    final MeasurementGroup group = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation").findOrCreateGroup("Distance-based Evaluation");
    group.addMeasure("Variance Ratio Criteria", score, 0., 1., 0., false);
    return score;
}
Also used : MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) StringStatistic(de.lmu.ifi.dbs.elki.logging.statistics.StringStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) SquaredEuclideanDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.minkowski.SquaredEuclideanDistanceFunction) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)

Example 2 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class EvaluateConcordantPairs method computeWithinDistances.

/**
 * Collect the distances between objects that share a cluster and return
 * them sorted in ascending order.
 *
 * For every cluster that is not skipped, a distance is recorded for each
 * pair of ids where the outer id compares greater than the inner id.
 *
 * @param rel Data relation supplying the vectors
 * @param clusters Clusters to process
 * @param withinPairs Number of within-cluster pairs; sizes the result array
 * @return Sorted within-cluster distances
 */
protected double[] computeWithinDistances(Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, int withinPairs) {
    final double[] dists = new double[withinPairs];
    int filled = 0;
    for (Cluster<?> cluster : clusters) {
        if (cluster.size() <= 1 || cluster.isNoise()) {
            switch(noiseHandling) {
                case IGNORE_NOISE:
                case TREAT_NOISE_AS_SINGLETONS:
                    // Neither mode yields within-cluster distances here.
                    continue;
                case MERGE_NOISE:
                    // Processed like a regular cluster below.
                    break;
            }
        }
        for (DBIDIter outer = cluster.getIDs().iter(); outer.valid(); outer.advance()) {
            final NumberVector first = rel.get(outer);
            for (DBIDIter inner = cluster.getIDs().iter(); inner.valid(); inner.advance()) {
                // Record a pair only when the outer id compares greater.
                if (DBIDUtil.compare(outer, inner) > 0) {
                    dists[filled] = distanceFunction.distance(first, rel.get(inner));
                    filled++;
                }
            }
        }
    }
    // The caller-provided pair count must match what was actually collected.
    assert (dists.length == filled);
    Arrays.sort(dists);
    return dists;
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 3 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class EvaluateDaviesBouldin method withinGroupDistances.

/**
 * Compute, for each cluster, the average distance of its members to the
 * cluster's centroid.
 *
 * @param rel Data relation supplying the vectors
 * @param clusters Clusters to process
 * @param centroids Centroid per cluster, in the iteration order of
 *        {@code clusters}; a {@code null} entry yields a distance of 0
 * @return Average member-to-centroid distance per cluster
 */
public double[] withinGroupDistances(Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, NumberVector[] centroids) {
    final double[] averages = new double[clusters.size()];
    int pos = 0;
    for (Cluster<?> cluster : clusters) {
        final NumberVector center = centroids[pos];
        if (center != null) {
            double sum = 0.;
            for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
                sum += distanceFunction.distance(center, rel.get(it));
            }
            averages[pos] = sum / cluster.size();
        } else {
            // No centroid (empty, noise or singleton cluster): zero spread.
            averages[pos] = 0.;
        }
        pos++;
    }
    return averages;
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 4 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class ArffParserTest method sparse.

/**
 * Parse the sparse ARFF test file and check the layout of the resulting
 * bundle: a number vector column followed by a class label column.
 *
 * @throws IOException if the test file cannot be read
 */
@Test
public void sparse() throws IOException {
    String filename = UNITTEST + "parsertest.sparse.arff";
    Parser parser = new ELKIBuilder<>(ArffParser.class).build();
    MultipleObjectsBundle bundle;
    try (InputStream is = open(filename);
        InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser)) {
        bundle = dbc.loadData();
    }
    // Ensure that the parser has correctly formed the bundle:
    // column 0 must be a number vector field, column 1 a class label.
    assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(bundle.meta(0)));
    assertTrue("Test file not as expected", TypeUtil.CLASSLABEL.isAssignableFromType(bundle.meta(1)));
    assertEquals("Length", 2, bundle.dataLength());
    // The first vector is expected to have four dimensions.
    assertEquals("Dimensionality", 4, ((NumberVector) bundle.data(0, 0)).getDimensionality());
    // Sparse missing values are supposed to be 0.
    NumberVector nv = (NumberVector) bundle.data(1, 0);
    assertEquals("Not 0 for missing data", 0., nv.doubleValue(0), 0.);
    assertEquals("Not 0 for missing data", 0., nv.doubleValue(2), 0.);
    // Check the concrete classes: sparse vectors in column 0, simple class
    // labels in column 1.
    assertEquals("Unexpected data type", SparseDoubleVector.class, bundle.data(0, 0).getClass());
    assertEquals("Unexpected data type", SimpleClassLabel.class, bundle.data(0, 1).getClass());
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) InputStream(java.io.InputStream) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) InputStreamDatabaseConnection(de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection) Test(org.junit.Test) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)

Example 5 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

the class GeoIndexing method main.

/**
 * Example program: index random latitude/longitude points with an R*-tree
 * and run two 10-nearest-neighbor queries against them, printing the results
 * and logging index statistics before and after the queries.
 *
 * @param args Command line arguments (unused)
 */
public static void main(String[] args) {
    // Set the logging level to statistics:
    LoggingConfiguration.setStatistics();
    // Generate a random data set with a fixed seed.
    // Note: ELKI has a nice data generator class, use that instead.
    final Random rng = new Random(0L);
    final double[][] points = new double[100000][];
    for (int row = 0; row < points.length; row++) {
        points[row] = randomLatitudeLongitude(rng);
    }
    // Adapter to load data from an existing array.
    final DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(points);
    // Since the R-tree has so many options, it is a bit easier to configure it
    // using the parameterization API, which handles defaults, instantiation,
    // and additional constraint checks.
    final RStarTreeFactory<?> indexfactory = new ELKIBuilder<>(RStarTreeFactory.class)
        .with(AbstractPageFileFactory.Parameterizer.PAGE_SIZE_ID, 512)
        .with(RStarTreeFactory.Parameterizer.BULK_SPLIT_ID, SortTileRecursiveBulkSplit.class)
        .build();
    // Create the database; initializing it also builds the index.
    final Database db = new StaticArrayDatabase(dbc, Arrays.asList(indexfactory));
    db.initialize();
    // Relation containing the number vectors we put in above:
    final Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
    // We can use this to identify rows of the input data below.
    final DBIDRange ids = (DBIDRange) rel.getDBIDs();
    // Dump the statistics of all indexes before querying.
    logIndexStatistics(db);
    // We use the WGS84 earth model, and "latitude, longitude" coordinates:
    // This distance function returns meters.
    final LatLngDistanceFunction df = new LatLngDistanceFunction(WGS84SpheroidEarthModel.STATIC);
    // k nearest neighbor query:
    final KNNQuery<NumberVector> knnq = QueryUtil.getKNNQuery(rel, df);
    // Let's find the closest points to New York:
    printNearest(knnq, rel, ids, DoubleVector.wrap(new double[] { 40.730610, -73.935242 }), "Close to New York:");
    // Many other indexes will fail if we search close to the date line.
    printNearest(knnq, rel, ids, DoubleVector.wrap(new double[] { -7.4784205, 178.679924 }), "Close to Tuvalu:");
    // Dump the statistics of all indexes again, now that queries have run.
    logIndexStatistics(db);
}

/**
 * Call {@code logStatistics()} on every index found among the descendants
 * of the database in its result hierarchy.
 *
 * @param db Database to inspect
 */
private static void logIndexStatistics(Database db) {
    It<Index> it = db.getHierarchy().iterDescendants(db).filter(Index.class);
    while (it.valid()) {
        it.get().logStatistics();
        it.advance();
    }
}

/**
 * Run a 10-nearest-neighbor query and print each hit with its distance in
 * kilometers and its row offset in the input data.
 *
 * @param knnq kNN query to execute
 * @param rel Relation used to look up the hit vectors
 * @param ids Id range used to translate hits into row offsets
 * @param center Query point
 * @param heading Line printed before the hits
 */
private static void printNearest(KNNQuery<NumberVector> knnq, Relation<NumberVector> rel, DBIDRange ids, DoubleVector center, String heading) {
    // Query first, so the heading is only printed once results exist.
    final KNNList hits = knnq.getKNNForObject(center, 10);
    System.out.println(heading);
    DoubleDBIDListIter it = hits.iter();
    while (it.valid()) {
        // To kilometers
        final double km = it.doubleValue() / 1000;
        System.out.println(rel.get(it) + " distance: " + km + " km row: " + ids.getOffset(it));
        it.advance();
    }
}
Also used : DoubleDBIDListIter(de.lmu.ifi.dbs.elki.database.ids.DoubleDBIDListIter) ArrayAdapterDatabaseConnection(de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection) SortTileRecursiveBulkSplit(de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.strategies.bulk.SortTileRecursiveBulkSplit) Index(de.lmu.ifi.dbs.elki.index.Index) LatLngDistanceFunction(de.lmu.ifi.dbs.elki.distance.distancefunction.geo.LatLngDistanceFunction) Random(java.util.Random) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) KNNList(de.lmu.ifi.dbs.elki.database.ids.KNNList) Database(de.lmu.ifi.dbs.elki.database.Database) StaticArrayDatabase(de.lmu.ifi.dbs.elki.database.StaticArrayDatabase) DBIDRange(de.lmu.ifi.dbs.elki.database.ids.DBIDRange) ArrayAdapterDatabaseConnection(de.lmu.ifi.dbs.elki.datasource.ArrayAdapterDatabaseConnection) DatabaseConnection(de.lmu.ifi.dbs.elki.datasource.DatabaseConnection) DoubleVector(de.lmu.ifi.dbs.elki.data.DoubleVector) RStarTreeFactory(de.lmu.ifi.dbs.elki.index.tree.spatial.rstarvariants.rstar.RStarTreeFactory) StaticArrayDatabase(de.lmu.ifi.dbs.elki.database.StaticArrayDatabase)

Aggregations

NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector) 85, DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter) 40, ArrayList (java.util.ArrayList) 16, LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) 9, DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector) 8, MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) 8, AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException) 8, Database (de.lmu.ifi.dbs.elki.database.Database) 7, DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs) 7, DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) 7, Random (java.util.Random) 7, Test (org.junit.Test) 7, VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation) 5, FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) 5, MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance) 5, EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult) 5, MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) 5, List (java.util.List) 5, SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector) 4, RandomProjectionFamily (de.lmu.ifi.dbs.elki.data.projection.random.RandomProjectionFamily) 4