use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class EvaluateVarianceRatioCriteria method evaluateClustering.
/**
 * Evaluate a single clustering with the Variance Ratio Criteria.
 * <p>
 * Besides returning the score, this method logs it (when statistics logging
 * is enabled) and registers it as a measure on the evaluation result of the
 * clustering.
 *
 * @param db Database; its result hierarchy is used to find or create the
 *        evaluation result
 * @param rel Data relation holding the vectors
 * @param c Clustering to evaluate
 * @return Variance Ratio Criteria; {@code 0} if the clustering has at most
 *         one cluster
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
  // FIXME: allow using a precomputed distance matrix!
  final SquaredEuclideanDistanceFunction sqdist = SquaredEuclideanDistanceFunction.STATIC;
  final List<? extends Cluster<?>> clusters = c.getAllClusters();
  double vrc = 0.;
  int ignorednoise = 0;
  if (clusters.size() > 1) {
    // Per-cluster centroids; the return value is used below as the number
    // of ignored noise objects (presumably - confirm against the helper).
    final NumberVector[] centroids = new NumberVector[clusters.size()];
    ignorednoise = EvaluateSimplifiedSilhouette.centroids(rel, clusters, centroids, noiseOption);
    // Global centroid, and the cluster count used for normalization:
    final Centroid overallCentroid = new Centroid(RelationUtil.dimensionality(rel));
    final int clustercount = globalCentroid(overallCentroid, rel, clusters, centroids, noiseOption);

    // within: summed squared distance of each object to its own centroid
    // total: summed squared distance of each object to the overall centroid
    double within = 0., total = 0.;
    for (int i = 0; i < clusters.size(); i++) {
      final Cluster<?> cluster = clusters.get(i);
      // Singletons contribute nothing to "within" (distance 0 by definition).
      boolean hasOwnCentroid = true;
      if (cluster.size() <= 1 || cluster.isNoise()) {
        switch(noiseOption) {
        case IGNORE_NOISE:
          // Contributes to neither sum.
          continue;
        case TREAT_NOISE_AS_SINGLETONS:
          hasOwnCentroid = false;
          break;
        case MERGE_NOISE:
          // Handled like a regular cluster.
          break;
        }
      }
      for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
        final NumberVector vec = rel.get(it);
        if (hasOwnCentroid) {
          within += sqdist.distance(centroids[i], vec);
        }
        total += sqdist.distance(overallCentroid, vec);
      }
    }

    // NOTE(review): neither within == 0 nor clustercount == 1 is guarded;
    // the double divisions then yield Infinity or NaN - confirm intended.
    final int size = rel.size();
    final double ratio = (total - within) / within;
    final double scale = (size - clustercount) / (clustercount - 1.);
    vrc = ratio * scale;
    // Only if {@link NoiseHandling#IGNORE_NOISE}:
    if (penalize && ignorednoise > 0) {
      vrc = vrc * ((size - ignorednoise) / (double) size);
    }
  }

  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(key + ".vrc.noise-handling", noiseOption.toString()));
    if (ignorednoise > 0) {
      LOG.statistics(new LongStatistic(key + ".vrc.ignored", ignorednoise));
    }
    LOG.statistics(new DoubleStatistic(key + ".vrc", vrc));
  }

  // Publish the score on the evaluation result attached to the clustering.
  final EvaluationResult result = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  final MeasurementGroup group = result.findOrCreateGroup("Distance-based Evaluation");
  group.addMeasure("Variance Ratio Criteria", vrc, 0., 1., 0., false);
  return vrc;
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class EvaluateConcordantPairs method computeWithinDistances.
/**
 * Collect the distances of all object pairs that share a cluster, sorted in
 * ascending order.
 *
 * @param rel Data relation holding the vectors
 * @param clusters Clusters to process
 * @param withinPairs Number of within-cluster pairs; determines the size of
 *        the returned array
 * @return Sorted within-cluster pair distances
 */
protected double[] computeWithinDistances(Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, int withinPairs) {
  final double[] dists = new double[withinPairs];
  int filled = 0;
  for (Cluster<?> cluster : clusters) {
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseHandling) {
      case IGNORE_NOISE:
      case TREAT_NOISE_AS_SINGLETONS:
        // Neither option produces within-cluster pairs for this cluster.
        continue;
      case MERGE_NOISE:
        // Processed like a regular cluster below.
        break;
      }
    }
    for (DBIDIter outer = cluster.getIDs().iter(); outer.valid(); outer.advance()) {
      final NumberVector first = rel.get(outer);
      for (DBIDIter inner = cluster.getIDs().iter(); inner.valid(); inner.advance()) {
        // Keep each unordered pair once, and never pair an object with itself.
        if (DBIDUtil.compare(outer, inner) > 0) {
          dists[filled++] = distanceFunction.distance(first, rel.get(inner));
        }
      }
    }
  }
  // The caller-supplied pair count must match the pairs actually found.
  assert (dists.length == filled);
  Arrays.sort(dists);
  return dists;
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class EvaluateDaviesBouldin method withinGroupDistances.
/**
 * Compute, for every cluster, the mean distance of its members to the
 * cluster centroid.
 *
 * @param rel Data relation holding the vectors
 * @param clusters Clusters to process
 * @param centroids Centroid of each cluster, in the iteration order of
 *        {@code clusters}; a {@code null} entry marks a cluster without a
 *        centroid
 * @return One mean distance per cluster; {@code 0} where the centroid is
 *         {@code null}
 */
public double[] withinGroupDistances(Relation<? extends NumberVector> rel, List<? extends Cluster<?>> clusters, NumberVector[] centroids) {
  // Entries start at 0, which is the value for clusters without centroid
  // (empty, noise or singleton cluster).
  final double[] means = new double[clusters.size()];
  int idx = 0;
  for (Cluster<?> cluster : clusters) {
    final NumberVector centroid = centroids[idx];
    if (centroid != null) {
      double sum = 0.;
      for (DBIDIter it = cluster.getIDs().iter(); it.valid(); it.advance()) {
        sum += distanceFunction.distance(centroid, rel.get(it));
      }
      means[idx] = sum / cluster.size();
    }
    idx++;
  }
  return means;
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class ArffParserTest method sparse.
/**
 * Load the sparse ARFF test file and verify the column types, the number of
 * rows, the vector dimensionality, and that missing sparse values read as 0.
 */
@Test
public void sparse() throws IOException {
  final Parser parser = new ELKIBuilder<>(ArffParser.class).build();
  final MultipleObjectsBundle bundle;
  try (InputStream in = open(UNITTEST + "parsertest.sparse.arff");
      InputStreamDatabaseConnection conn = new InputStreamDatabaseConnection(in, null, parser)) {
    bundle = conn.loadData();
  }

  // Column 0 must be a number vector field, column 1 a class label.
  assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(bundle.meta(0)));
  assertTrue("Test file not as expected", TypeUtil.CLASSLABEL.isAssignableFromType(bundle.meta(1)));

  // Two rows, each vector with four dimensions.
  assertEquals("Length", 2, bundle.dataLength());
  final NumberVector firstRow = (NumberVector) bundle.data(0, 0);
  assertEquals("Length", 4, firstRow.getDimensionality());

  // Sparse missing values are supposed to be 0.
  final NumberVector secondRow = (NumberVector) bundle.data(1, 0);
  assertEquals("Not 0 for missing data", 0., secondRow.doubleValue(0), 0.);
  assertEquals("Not 0 for missing data", 0., secondRow.doubleValue(2), 0.);

  // Concrete runtime classes of the first row: sparse vector and class label.
  assertEquals("Unexpected data type", SparseDoubleVector.class, bundle.data(0, 0).getClass());
  assertEquals("Unexpected data type", SimpleClassLabel.class, bundle.data(0, 1).getClass());
}
use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
the class GeoIndexing method main.
/**
 * Example: index random latitude/longitude points with a bulk-loaded R*-tree
 * and run two 10-nearest-neighbor queries using a geodetic distance.
 * <p>
 * Index statistics are logged once after building the index and once after
 * the queries.
 *
 * @param args Command line arguments (unused)
 */
public static void main(String[] args) {
  // Set the logging level to statistics:
  LoggingConfiguration.setStatistics();

  // Generate a random data set with a fixed seed.
  // Note: ELKI has a nice data generator class, use that instead.
  final Random rand = new Random(0L);
  final double[][] data = new double[100000][];
  for (int row = 0; row < data.length; row++) {
    data[row] = randomLatitudeLongitude(rand);
  }
  // Adapter to load data from an existing array.
  final DatabaseConnection dbc = new ArrayAdapterDatabaseConnection(data);

  // Since the R-tree has so many options, it is a bit easier to configure it
  // using the parameterization API, which handles defaults, instantiation,
  // and additional constraint checks.
  RStarTreeFactory<?> indexfactory = //
      new ELKIBuilder<>(RStarTreeFactory.class).with(AbstractPageFileFactory.Parameterizer.PAGE_SIZE_ID, //
          512).with(RStarTreeFactory.Parameterizer.BULK_SPLIT_ID, //
              SortTileRecursiveBulkSplit.class).build();

  // Create the database; initializing it also builds the index.
  final Database db = new StaticArrayDatabase(dbc, Arrays.asList(indexfactory));
  db.initialize();

  // Relation containing the number vectors we put in above:
  final Relation<NumberVector> rel = db.getRelation(TypeUtil.NUMBER_VECTOR_FIELD);
  // We can use this to identify rows of the input data below.
  final DBIDRange ids = (DBIDRange) rel.getDBIDs();

  // Statistics of all indexes, before running any query.
  logIndexStatistics(db);

  // We use the WGS84 earth model, and "latitude, longitude" coordinates:
  // This distance function returns meters.
  final LatLngDistanceFunction df = new LatLngDistanceFunction(WGS84SpheroidEarthModel.STATIC);
  // k nearest neighbor query:
  final KNNQuery<NumberVector> knnq = QueryUtil.getKNNQuery(rel, df);

  // Let's find the closest points to New York:
  final DoubleVector newYork = DoubleVector.wrap(new double[] { 40.730610, -73.935242 });
  printNeighbors("Close to New York:", knnq.getKNNForObject(newYork, 10), rel, ids);

  // Many other indexes will fail if we search close to the date line.
  final DoubleVector tuvalu = DoubleVector.wrap(new double[] { -7.4784205, 178.679924 });
  printNeighbors("Close to Tuvalu:", knnq.getKNNForObject(tuvalu, 10), rel, ids);

  // Statistics of all indexes again, now including the queries above.
  logIndexStatistics(db);
}

/**
 * Log the statistics of every index found below the database in its result
 * hierarchy.
 *
 * @param db Database whose indexes are reported
 */
private static void logIndexStatistics(Database db) {
  for (It<Index> it = db.getHierarchy().iterDescendants(db).filter(Index.class); it.valid(); it.advance()) {
    it.get().logStatistics();
  }
}

/**
 * Print a heading followed by one line per neighbor: the vector, its
 * distance in kilometers, and its row offset in the input data.
 *
 * @param heading Line printed before the neighbors
 * @param knns Nearest neighbor result; distances are divided by 1000 for output
 * @param rel Relation used to look up the neighbor vectors
 * @param ids ID range used to map a neighbor to its row offset
 */
private static void printNeighbors(String heading, KNNList knns, Relation<NumberVector> rel, DBIDRange ids) {
  System.out.println(heading);
  for (DoubleDBIDListIter it = knns.iter(); it.valid(); it.advance()) {
    // To kilometers
    double km = it.doubleValue() / 1000;
    System.out.println(rel.get(it) + " distance: " + km + " km row: " + ids.getOffset(it));
  }
}
Aggregations