Search in sources :

Example 66 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

In class EvaluateSquaredErrors, method evaluateClustering.

/**
 * Compute distance-to-center statistics for one clustering and register them
 * as an evaluation result.
 *
 * Clusters with at most one member, or flagged as noise, are handled
 * according to {@code noiseOption}: with IGNORE_NOISE they are skipped and
 * their members are excluded from the divisor; with
 * TREAT_NOISE_AS_SINGLETONS they are skipped but still count in the divisor;
 * with MERGE_NOISE they are processed like any other cluster.
 *
 * @param db Database whose result hierarchy receives the evaluation
 * @param rel Data relation
 * @param c Clustering to evaluate
 * @return the accumulated sum of squares
 */
public double evaluateClustering(Database db, Relation<? extends NumberVector> rel, Clustering<?> c) {
    // If the distance is not already squared, square it for the SSQ term.
    final boolean needsSquaring = !distance.isSquared();
    int skippedNoise = 0;
    List<? extends Cluster<?>> allClusters = c.getAllClusters();
    double total = 0, sumOfSquares = 0;
    for (Cluster<?> current : allClusters) {
        if (cluster​IsTrivial(current)) {
            switch(noiseOption) {
                case IGNORE_NOISE:
                    skippedNoise += current.size();
                    continue;
                case TREAT_NOISE_AS_SINGLETONS:
                    continue;
                case MERGE_NOISE:
                    // Fall out of the switch and process as a regular cluster.
                    break;
            }
        }
        final NumberVector prototype = ModelUtil.getPrototypeOrCentroid(current.getModel(), rel, current.getIDs());
        DBIDIter member = current.getIDs().iter();
        while (member.valid()) {
            final double dist = distance.distance(prototype, rel.get(member));
            total += dist;
            if (needsSquaring) {
                sumOfSquares += dist * dist;
            } else {
                sumOfSquares += dist;
            }
            member.advance();
        }
    }
    // Never divide by zero, even if everything was ignored.
    final int divisor = Math.max(1, rel.size() - skippedNoise);
    final double meanDistance = total / divisor;
    final double rmsd = FastMath.sqrt(sumOfSquares / divisor);
    if (LOG.isStatistics()) {
        LOG.statistics(new DoubleStatistic(key + ".mean", meanDistance));
        LOG.statistics(new DoubleStatistic(key + ".ssq", sumOfSquares));
        LOG.statistics(new DoubleStatistic(key + ".rmsd", rmsd));
    }
    EvaluationResult result = EvaluationResult.findOrCreate(db.getHierarchy(), c, "Internal Clustering Evaluation", "internal evaluation");
    MeasurementGroup group = result.findOrCreateGroup("Distance-based Evaluation");
    group.addMeasure("Mean distance", meanDistance, 0., Double.POSITIVE_INFINITY, true);
    group.addMeasure("Sum of Squares", sumOfSquares, 0., Double.POSITIVE_INFINITY, true);
    group.addMeasure("RMSD", rmsd, 0., Double.POSITIVE_INFINITY, true);
    db.getHierarchy().add(c, result);
    return sumOfSquares;
}

/** Tests whether a cluster has at most one member or is flagged as noise. */
private static boolean cluster​IsTrivial(Cluster<?> cluster) {
    return cluster.size() <= 1 || cluster.isNoise();
}
Also used : DoubleStatistic(de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) MeasurementGroup(de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup) EvaluationResult(de.lmu.ifi.dbs.elki.result.EvaluationResult) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Example 67 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

In class RandomProjectedNeighborsAndDensities, method computeSetsBounds.

/**
 * Create random projections, project points and put points into sets of size
 * about minSplitSize/2
 *
 * The number of projections and the number of split rounds are both derived
 * from {@code logOProjectionConst * log2(size * dim + 1)}, where size is the
 * size of the relation and dim its dimensionality.
 *
 * @param points points to process
 * @param minSplitSize minimum size for which a point set is further
 *        partitioned (roughly corresponds to minPts in OPTICS)
 * @param ptList Points that are to be projected
 */
public void computeSetsBounds(Relation<V> points, int minSplitSize, DBIDs ptList) {
    this.minSplitSize = minSplitSize;
    final int size = points.size();
    final int dim = RelationUtil.dimensionality(points);
    this.points = points;
    // Multiply in double arithmetic: as an int product, size * dim + 1 wraps
    // around for large relations, which yields a NaN logarithm and hence zero
    // projections and zero splits.
    final double logTerm = MathUtil.log2((double) size * dim + 1);
    // perform O(log N+log dim) splits of the entire point sets projections
    int nPointSetSplits = (int) (logOProjectionConst * logTerm);
    // perform O(log N+log dim) projections of the point set onto a random line
    int nProject1d = (int) (logOProjectionConst * logTerm);
    LOG.statistics(new LongStatistic(PREFIX + ".partition-size", nPointSetSplits));
    LOG.statistics(new LongStatistic(PREFIX + ".num-projections", nProject1d));
    splitsets = new ArrayList<>();
    // perform projections of points
    projectedPoints = new DoubleDataStore[nProject1d];
    DoubleDataStore[] tmpPro = new DoubleDataStore[nProject1d];
    Random rand = rnd.getSingleThreadedRandom();
    FiniteProgress projp = LOG.isVerbose() ? new FiniteProgress("Random projections", nProject1d, LOG) : null;
    for (int j = 0; j < nProject1d; j++) {
        // Random direction: each component drawn from [-0.5, 0.5), then the
        // vector is scaled to unit Euclidean length.
        double[] currRp = new double[dim];
        double sum = 0;
        for (int i = 0; i < dim; i++) {
            double fl = rand.nextDouble() - 0.5;
            currRp[i] = fl;
            sum += fl * fl;
        }
        sum = FastMath.sqrt(sum);
        for (int i = 0; i < dim; i++) {
            currRp[i] /= sum;
        }
        WritableDoubleDataStore currPro = DataStoreUtil.makeDoubleStorage(ptList, DataStoreFactory.HINT_HOT);
        for (DBIDIter it = ptList.iter(); it.valid(); it.advance()) {
            NumberVector vecPt = points.get(it);
            // Dot product:
            double sum2 = 0;
            for (int i = 0; i < dim; i++) {
                sum2 += currRp[i] * vecPt.doubleValue(i);
            }
            currPro.put(it, sum2);
        }
        projectedPoints[j] = currPro;
        LOG.incrementProcessed(projp);
    }
    LOG.ensureCompleted(projp);
    // Log the number of scalar projections performed.
    long numprod = nProject1d * (long) ptList.size();
    LOG.statistics(new LongStatistic(PREFIX + ".num-scalar-products", numprod));
    // split entire point set, reuse projections by shuffling them
    IntArrayList proind = new IntArrayList(nProject1d);
    for (int j = 0; j < nProject1d; j++) {
        proind.add(j);
    }
    FiniteProgress splitp = LOG.isVerbose() ? new FiniteProgress("Splitting data", nPointSetSplits, LOG) : null;
    for (int avgP = 0; avgP < nPointSetSplits; avgP++) {
        // Snapshot the current order of the projections before permuting.
        System.arraycopy(projectedPoints, 0, tmpPro, 0, nProject1d);
        // Shuffle the index list in place. The index list is not reset between
        // rounds, so permutations accumulate.
        // NOTE(review): j is drawn from [0, i), so position i is always
        // swapped with an earlier position and never stays in place; a
        // textbook Fisher-Yates shuffle would draw from [0, i]. Left unchanged
        // to keep seeded runs reproducible - confirm whether this is intended.
        for (int i = 1; i < nProject1d; i++) {
            final int j = rand.nextInt(i);
            // Swap i,j
            proind.set(i, proind.set(j, proind.getInt(i)));
        }
        // Reassign the snapshot to the permuted positions.
        IntIterator it = proind.iterator();
        int i = 0;
        while (it.hasNext()) {
            int cind = it.nextInt();
            projectedPoints[cind] = tmpPro[i];
            i++;
        }
        // split point set
        // NOTE(review): the end bound is the relation size, while the array is
        // built from ptList; presumably both are expected to have the same
        // number of entries - verify against callers.
        splitupNoSort(DBIDUtil.newArray(ptList), 0, size, 0, rand);
        LOG.incrementProcessed(splitp);
    }
    LOG.ensureCompleted(splitp);
}
Also used : IntIterator(it.unimi.dsi.fastutil.ints.IntIterator) FiniteProgress(de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress) Random(java.util.Random) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) LongStatistic(de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic) IntArrayList(it.unimi.dsi.fastutil.ints.IntArrayList)

Example 68 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

In class ArffParserTest, method dense.

/**
 * Load the dense ARFF test file and verify the layout of the parsed bundle:
 * the column types, the number of rows, the vector dimensionality, NaN
 * placeholders for missing values, and the concrete classes of the first
 * row's vector and class label.
 *
 * @throws IOException declared for the stream handling while loading the file
 */
@Test
public void dense() throws IOException {
    final String path = UNITTEST + "parsertest.arff";
    Parser parser = new ELKIBuilder<>(ArffParser.class).build();
    final String unexpectedFile = "Test file not as expected";
    final String missingMessage = "Expected NaN for missing data";
    MultipleObjectsBundle bundle;
    try (InputStream in = open(path);
        InputStreamDatabaseConnection connection = new InputStreamDatabaseConnection(in, null, parser)) {
        bundle = connection.loadData();
    }
    // Column layout expected from the test file, in order: number vector
    // field, class label, label list, external id.
    assertTrue(unexpectedFile, TypeUtil.NUMBER_VECTOR_FIELD.isAssignableFromType(bundle.meta(0)));
    assertTrue(unexpectedFile, TypeUtil.CLASSLABEL.isAssignableFromType(bundle.meta(1)));
    assertTrue(unexpectedFile, TypeUtil.LABELLIST.isAssignableFromType(bundle.meta(2)));
    assertTrue(unexpectedFile, TypeUtil.EXTERNALID.isAssignableFromType(bundle.meta(3)));
    // 11 rows, and the first row holds a 4-dimensional vector.
    assertEquals("Length", 11, bundle.dataLength());
    NumberVector firstVector = (NumberVector) bundle.data(0, 0);
    assertEquals("Length", 4, firstVector.getDimensionality());
    // In the dense representation, missing values must come back as NaN;
    // row 10 is expected to have them at dimensions 1 and 3.
    NumberVector withMissing = (NumberVector) bundle.data(10, 0);
    assertTrue(missingMessage, Double.isNaN(withMissing.doubleValue(1)));
    assertTrue(missingMessage, Double.isNaN(withMissing.doubleValue(3)));
    // Concrete classes of the first row: vector column, then class label column.
    assertEquals("Unexpected data type", DoubleVector.class, bundle.data(0, 0).getClass());
    assertEquals("Unexpected data type", SimpleClassLabel.class, bundle.data(0, 1).getClass());
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) InputStream(java.io.InputStream) MultipleObjectsBundle(de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle) InputStreamDatabaseConnection(de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection) Test(org.junit.Test) AbstractDataSourceTest(de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest)

Example 69 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

In class RelationUtil, method relationAsMatrix.

/**
 * Build a freshly allocated double matrix holding a <em>copy</em> of the
 * vectors of a relation, one row per id in the order given by {@code ids}.
 *
 * Copying is <em>not recommended</em> unless the data needs to be modified
 * temporarily.
 *
 * @param relation Relation to read the vectors from
 * @param ids IDs, with well-defined order (i.e. array)
 * @return Data matrix with {@code ids.size()} rows and one column per
 *         dimension of the relation
 */
public static double[][] relationAsMatrix(final Relation<? extends NumberVector> relation, ArrayDBIDs ids) {
    final int numRows = ids.size();
    final int numCols = dimensionality(relation);
    final double[][] matrix = new double[numRows][numCols];
    int filled = 0;
    DBIDArrayIter cursor = ids.iter();
    while (cursor.valid()) {
        final NumberVector source = relation.get(cursor);
        final double[] target = matrix[filled];
        for (int col = 0; col < numCols; col++) {
            target[col] = source.doubleValue(col);
        }
        cursor.advance();
        filled++;
    }
    // Every row must have been written exactly once.
    assert (filled == numRows);
    return matrix;
}
Also used : NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) DBIDArrayIter(de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter)

Example 70 with NumberVector

use of de.lmu.ifi.dbs.elki.data.NumberVector in project elki by elki-project.

In class WeightedCovarianceMatrixBuilder, method processIds.

/**
 * Build a weighted covariance matrix for a set of IDs. No distance
 * information is supplied, so the distances to the centroid are computed
 * here, in two passes: the first pass finds the maximum distance and the
 * root mean squared distance, the second pass derives a weight per object
 * from them and adds the object to the covariance matrix.
 *
 * Covariance is tied to Euclidean distance, so it probably does not make much
 * sense to add support for other distance functions?
 *
 * @param ids Database ids to process
 * @param relation Relation to process
 * @return Covariance matrix
 */
@Override
public double[][] processIds(DBIDs ids, Relation<? extends NumberVector> relation) {
    final int dimensions = RelationUtil.dimensionality(relation);
    final CovarianceMatrix weighted = new CovarianceMatrix(dimensions);
    final Centroid center = Centroid.make(relation, ids);

    // First pass: maximum distance and sum of squared distances.
    double farthest = 0.0;
    double squaredSum = 0.0;
    DBIDIter scan = ids.iter();
    while (scan.valid()) {
        final double dist = weightDistance.distance(center, relation.get(scan));
        squaredSum += dist * dist;
        if (dist > farthest) {
            farthest = dist;
        }
        scan.advance();
    }
    // Fall back to 1 when all objects coincide with the centroid.
    if (farthest == 0.0) {
        farthest = 1.0;
    }
    final double deviation = FastMath.sqrt(squaredSum / ids.size());

    // Second pass: weight each object by its distance to the centroid.
    DBIDIter fill = ids.iter();
    while (fill.valid()) {
        final NumberVector vector = relation.get(fill);
        final double dist = weightDistance.distance(center, vector);
        weighted.put(vector, weightfunction.getWeight(dist, farthest, deviation));
        fill.advance();
    }
    return weighted.destroyToPopulationMatrix();
}
Also used : Centroid(de.lmu.ifi.dbs.elki.math.linearalgebra.Centroid) NumberVector(de.lmu.ifi.dbs.elki.data.NumberVector) CovarianceMatrix(de.lmu.ifi.dbs.elki.math.linearalgebra.CovarianceMatrix) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Aggregations

NumberVector (de.lmu.ifi.dbs.elki.data.NumberVector)85 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)40 ArrayList (java.util.ArrayList)16 LongStatistic (de.lmu.ifi.dbs.elki.logging.statistics.LongStatistic)9 DoubleVector (de.lmu.ifi.dbs.elki.data.DoubleVector)8 MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle)8 AbortException (de.lmu.ifi.dbs.elki.utilities.exceptions.AbortException)8 Database (de.lmu.ifi.dbs.elki.database.Database)7 DBIDs (de.lmu.ifi.dbs.elki.database.ids.DBIDs)7 DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic)7 Random (java.util.Random)7 Test (org.junit.Test)7 VectorFieldTypeInformation (de.lmu.ifi.dbs.elki.data.type.VectorFieldTypeInformation)5 FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress)5 MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance)5 EvaluationResult (de.lmu.ifi.dbs.elki.result.EvaluationResult)5 MeasurementGroup (de.lmu.ifi.dbs.elki.result.EvaluationResult.MeasurementGroup)5 List (java.util.List)5 SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector)4 RandomProjectionFamily (de.lmu.ifi.dbs.elki.data.projection.random.RandomProjectionFamily)4