Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
The class EvaluateSquaredErrors, method evaluateClustering.
/**
* Evaluate a single clustering.
*
* @param db Database
* @param rel Data relation
* @param c Clustering
* @return ssq
*/
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
  boolean square = !distance.isSquared();
  int ignorednoise = 0;
  List<? extends Cluster<?>> clusters = c.getAllClusters();
  double ssq = 0, sum = 0;
  for (Cluster<?> cluster : clusters) {
    if (cluster.size() <= 1 || cluster.isNoise()) {
      switch(noiseOption) {
      case IGNORE_NOISE:
        ignorednoise += cluster.size();
        continue;
      case TREAT_NOISE_AS_SINGLETONS:
        continue;
      case MERGE_NOISE:
        // Treat as cluster below:
        break;
      }
    }
    NumberVector center = ModelUtil.getPrototypeOrCentroid(cluster.getModel(), rel, cluster.getIDs());
    for (DBIDIter it1 = cluster.getIDs().iter(); it1.valid(); it1.advance()) {
      final double d = distance.distance(center, rel.get(it1));
      sum += d;
      ssq += square ? d * d : d;
    }
  }
  final int div = Math.max(1, rel.size() - ignorednoise);
  if (LOG.isStatistics()) {
    LOG.statistics(new DoubleStatistic(key + ".mean", sum / div));
    LOG.statistics(new DoubleStatistic(key + ".ssq", ssq));
    LOG.statistics(new DoubleStatistic(key + ".rmsd", FastMath.sqrt(ssq / div)));
  }
  EvaluationResult ev = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
  MeasurementGroup g = ev.findOrCreateGroup("Distance-based Evaluation");
  g.addMeasure("Mean distance", sum / div, 0., Double.POSITIVE_INFINITY, true);
  g.addMeasure("Sum of Squares", ssq, 0., Double.POSITIVE_INFINITY, true);
  g.addMeasure("RMSD", FastMath.sqrt(ssq / div), 0., Double.POSITIVE_INFINITY, true);
  db.getHierarchy().add(c, ev);
  return ssq;
}
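A minimal usage sketch (not part of the ELKI sources): db, rel, and clustering are assumed to already exist, e.g. from a prior k-means run, and the evaluator is built with its default parameterization.

// Hypothetical caller; db, rel and clustering are assumptions.
EvaluateSquaredErrors eval = new ELKIBuilder<>(EvaluateSquaredErrors.class).build();
double ssq = eval.evaluateClustering(db, rel, clustering);
// Besides the return value, the measures are also attached to the result
// hierarchy as an EvaluationResult, as seen at the end of the method above.

Note that when the configured distance is already squared, the square flag above disables the extra squaring, so the returned value is the usual k-means sum-of-squares objective either way.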
Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
The class RandomProjectedNeighborsAndDensities, method computeSetsBounds.
/**
* Create random projections, project points and put points into sets of size
* about minSplitSize/2
*
* @param points points to process
* @param minSplitSize minimum size for which a point set is further
* partitioned (roughly corresponds to minPts in OPTICS)
* @param ptList Points that are to be projected
*/
public void computeSetsBounds(Relation<V> points, int minSplitSize, DBIDs ptList) {
  this.minSplitSize = minSplitSize;
  final int size = points.size();
  final int dim = RelationUtil.dimensionality(points);
  this.points = points;
  // Number of splits of the entire point set: O(log N + log dim)
  int nPointSetSplits = (int) (logOProjectionConst * MathUtil.log2(size * dim + 1));
  // Number of projections of the point set onto random lines: O(log N + log dim)
  int nProject1d = (int) (logOProjectionConst * MathUtil.log2(size * dim + 1));
  LOG.statistics(new LongStatistic(PREFIX + ".partition-size", nPointSetSplits));
  LOG.statistics(new LongStatistic(PREFIX + ".num-projections", nProject1d));
  splitsets = new ArrayList<>();
  // Perform the projections of the points:
  projectedPoints = new DoubleDataStore[nProject1d];
  DoubleDataStore[] tmpPro = new DoubleDataStore[nProject1d];
  Random rand = rnd.getSingleThreadedRandom();
  FiniteProgress projp = LOG.isVerbose() ? new FiniteProgress("Random projections", nProject1d, LOG) : null;
  for (int j = 0; j < nProject1d; j++) {
    // Draw a random unit vector to project onto:
    double[] currRp = new double[dim];
    double sum = 0;
    for (int i = 0; i < dim; i++) {
      double fl = rand.nextDouble() - 0.5;
      currRp[i] = fl;
      sum += fl * fl;
    }
    sum = FastMath.sqrt(sum);
    for (int i = 0; i < dim; i++) {
      currRp[i] /= sum;
    }
    WritableDoubleDataStore currPro = DataStoreUtil.makeDoubleStorage(ptList, DataStoreFactory.HINT_HOT);
    for (DBIDIter it = ptList.iter(); it.valid(); it.advance()) {
      NumberVector vecPt = points.get(it);
      // Dot product:
      double sum2 = 0;
      for (int i = 0; i < dim; i++) {
        sum2 += currRp[i] * vecPt.doubleValue(i);
      }
      currPro.put(it, sum2);
    }
    projectedPoints[j] = currPro;
    LOG.incrementProcessed(projp);
  }
  LOG.ensureCompleted(projp);
  // Log the number of scalar projections performed.
  long numprod = nProject1d * (long) ptList.size();
  LOG.statistics(new LongStatistic(PREFIX + ".num-scalar-products", numprod));
  // Split the entire point set, reusing the projections by shuffling them.
  IntArrayList proind = new IntArrayList(nProject1d);
  for (int j = 0; j < nProject1d; j++) {
    proind.add(j);
  }
  FiniteProgress splitp = LOG.isVerbose() ? new FiniteProgress("Splitting data", nPointSetSplits, LOG) : null;
  for (int avgP = 0; avgP < nPointSetSplits; avgP++) {
    // Copy the current projection order:
    for (int i = 0; i < nProject1d; i++) {
      tmpPro[i] = projectedPoints[i];
    }
    // Shuffle the projection indexes. Note: nextInt(i) excludes i, so this is
    // Sattolo's variant (uniform over cyclic permutations) rather than the
    // classic Fisher-Yates shuffle; it still randomizes the order.
    for (int i = 1; i < nProject1d; i++) {
      final int j = rand.nextInt(i);
      // Swap i,j
      proind.set(i, proind.set(j, proind.getInt(i)));
    }
    // Apply the permutation to the projections:
    IntIterator it = proind.iterator();
    int i = 0;
    while (it.hasNext()) {
      int cind = it.nextInt();
      projectedPoints[cind] = tmpPro[i];
      i++;
    }
    // Recursively split the point set:
    splitupNoSort(DBIDUtil.newArray(ptList), 0, size, 0, rand);
    LOG.incrementProcessed(splitp);
  }
  LOG.ensureCompleted(splitp);
}
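A hedged sketch of how a caller such as FastOPTICS might drive this method; the constructor argument and the two follow-up calls are assumptions about the surrounding class, not shown in the snippet above.

// Hypothetical driver; relation and minPts are assumptions.
RandomProjectedNeighborsAndDensities<DoubleVector> proj = new RandomProjectedNeighborsAndDensities<>(RandomFactory.DEFAULT);
proj.computeSetsBounds(relation, minPts, relation.getDBIDs()); // project and split
DoubleDataStore avgDists = proj.computeAverageDistInSet(); // per-point density estimates
DataStore<? extends DBIDs> neighbors = proj.getNeighs(); // neighbor candidates from the split sets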
Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
The class ArffParserTest, method dense.
@Test
public void dense() throws IOException {
  String filename = UNITTEST + "parsertest.arff";
  Parser parser = new ELKIBuilder<>(ArffParser.class).build();
  MultipleObjectsBundle bundle;
  try (InputStream is = open(filename);
      InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser)) {
    bundle = dbc.loadData();
  }
  // Ensure that the parser has correctly formed the bundle:
  // column 0 must be a number vector field, column 1 a class label,
  // column 2 a LabelList, and column 3 an external ID.
  assertTrue("Test file not as expected", TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(bundle.meta(0)));
  assertTrue("Test file not as expected", TypeUtil.CLASSLABEL.isAssignableFromType(bundle.meta(1)));
  assertTrue("Test file not as expected", TypeUtil.LABELLIST.isAssignableFromType(bundle.meta(2)));
  assertTrue("Test file not as expected", TypeUtil.EXTERNALID.isAssignableFromType(bundle.meta(3)));
  assertEquals("Length", 11, bundle.dataLength());
  assertEquals("Dimensionality", 4, ((NumberVector) bundle.data(0, 0)).getDimensionality());
  // Dense missing values are supposed to be NaN
  NumberVector nv = (NumberVector) bundle.data(10, 0);
  assertTrue("Expected NaN for missing data", Double.isNaN(nv.doubleValue(1)));
  assertTrue("Expected NaN for missing data", Double.isNaN(nv.doubleValue(3)));
  // Check the concrete data types of the first two columns.
  assertEquals("Unexpected data type", DoubleVector.class, bundle.data(0, 0).getClass());
  assertEquals("Unexpected data type", SimpleClassLabel.class, bundle.data(0, 1).getClass());
}
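The same loading pattern works outside of a unit test; a minimal sketch, assuming data.arff is a dense ARFF file on disk (the path is a placeholder):

// Load an ARFF file into a bundle, mirroring the test above.
Parser parser = new ELKIBuilder<>(ArffParser.class).build();
try (InputStream in = Files.newInputStream(Paths.get("data.arff"));
    InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(in, null, parser)) {
  MultipleObjectsBundle bundle = dbc.loadData();
  System.out.println(bundle.dataLength() + " objects in " + bundle.metaLength() + " columns");
}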
Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
The class RelationUtil, method relationAsMatrix.
/**
* <em>Copy</em> a relation into a double matrix.
*
* This is <em>not recommended</em> unless you need to modify the data
* temporarily.
*
* @param relation Relation
* @param ids IDs, with well-defined order (i.e. array)
* @return Data matrix
*/
public static double[][] relationAsMatrix(final Relation<? extends NumberVector> relation, ArrayDBIDs ids) {
  final int rowdim = ids.size();
  final int coldim = dimensionality(relation);
  double[][] mat = new double[rowdim][coldim];
  int r = 0;
  for (DBIDArrayIter iter = ids.iter(); iter.valid(); iter.advance(), r++) {
    NumberVector vec = relation.get(iter);
    double[] row = mat[r];
    for (int c = 0; c < coldim; c++) {
      row[c] = vec.doubleValue(c);
    }
  }
  assert (r == rowdim);
  return mat;
}
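A short usage sketch (relation is an assumed Relation<? extends NumberVector>); because the matrix is a copy, it can be centered or otherwise modified without affecting the relation:

// Materialize the relation as a plain double matrix.
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
double[][] mat = RelationUtil.relationAsMatrix(relation, ids);
// Row r of mat holds the vector of ids.get(r); edits do not write back.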
Use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.
The class WeightedCovarianceMatrixBuilder, method processIds.
/**
 * Weighted covariance matrix for a set of IDs. Since we are not supplied any
 * distance information, we need to compute it ourselves. Covariance is tied
 * to Euclidean distance, so it probably makes little sense to add support
 * for other distance functions.
 *
 * @param ids Database ids to process
 * @param relation Relation to process
 * @return Covariance matrix
 */
@Override
public double[][] processIds(DBIDs ids, Relation<? extends NumberVector> relation) {
  final int dim = RelationUtil.dimensionality(relation);
  final CovarianceMatrix cmat = new CovarianceMatrix(dim);
  final Centroid centroid = Centroid.make(relation, ids);
  // First pass: find the maximum distance and the standard deviation.
  double maxdist = 0.0, stddev = 0.0;
  {
    for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
      double distance = weightDistance.distance(centroid, relation.get(iter));
      stddev += distance * distance;
      if (distance > maxdist) {
        maxdist = distance;
      }
    }
    if (maxdist == 0.0) {
      maxdist = 1.0;
    }
    // Compute the standard deviation.
    stddev = FastMath.sqrt(stddev / ids.size());
  }
  // Second pass: accumulate the covariance matrix with per-point weights.
  for (DBIDIter iter = ids.iter(); iter.valid(); iter.advance()) {
    NumberVector obj = relation.get(iter);
    double distance = weightDistance.distance(centroid, obj);
    double weight = weightfunction.getWeight(distance, maxdist, stddev);
    cmat.put(obj, weight);
  }
  return cmat.destroyToPopulationMatrix();
}
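A hedged sketch of calling the builder directly; the ErfcWeight weighting function and the constructor signature are assumptions, and in a typical setup the builder would instead be plugged into a PCA runner:

// Hypothetical direct use; relation is an assumption.
WeightedCovarianceMatrixBuilder builder = new WeightedCovarianceMatrixBuilder(new ErfcWeight());
double[][] cov = builder.processIds(relation.getDBIDs(), relation);
// cov is a dim x dim population covariance matrix; points far from the
// centroid (relative to maxdist and stddev) receive lower weights.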