Search in sources:

Example 1 with IntegerArray

Use of de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray in project elki by elki-project.

The class IntegerRankTieNormalizationTest, method defaultParameters().

/**
 * Test with default parameters.
 */
@Test
public void defaultParameters() {
    String filename = UNITTEST + "normalization-test-1.csv";
    IntegerRankTieNormalization filter = new ELKIBuilder<>(IntegerRankTieNormalization.class).build();
    MultipleObjectsBundle bundle = readBundle(filename, filter);
    int dim = getFieldDimensionality(bundle, 0, TypeUtil.NUMBER_VECTOR_FIELD);
    IntegerArray coldata = new IntegerArray(bundle.dataLength());
    for (int col = 0; col < dim; col++) {
        coldata.clear();
        // Extract the column:
        for (int row = 0; row < bundle.dataLength(); row++) {
            IntegerVector obj = get(bundle, row, 0, IntegerVector.class);
            coldata.add(obj.intValue(col));
        }
        // Sort values:
        coldata.sort();
        // Verify that the gap matches the frequency of each value.
        final int size = coldata.size;
        assertEquals("First value", coldata.get(0), coldata.get(coldata.get(0)));
        for (int i = 0; i < size; ) {
            // s: Start, i: end, v: value, f: frequency
            int s = i, v = coldata.get(i), f = 1;
            while (++i < size && v == coldata.get(i)) {
                f++;
            }
            // The value is odd iff the frequency is even.
            assertNotSame("Even/odd rule", (f & 1), (v & 1));
            assertEquals("Bad value at position " + s, s + i - 1, v);
            assertEquals("Bad frequency at position " + s, i - s, f);
        }
    }
}
Also used: IntegerVector (de.lmu.ifi.dbs.elki.data.IntegerVector), MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle), IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray), AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest), Test (org.junit.Test)
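
The assertions above encode a simple invariant: after sorting a normalized column, a tie group of frequency f starting at 0-based position s carries the value s + (s + f) - 1, i.e. the sum of its first and last positions, which is why the value is odd exactly when the frequency is even. The following dependency-free sketch (not part of ELKI) restates that check on a plain int[]; the example input is made up to satisfy the invariant, and the asserts need the -ea flag.

import java.util.Arrays;

public class RankTieInvariantSketch {
    static void checkInvariant(int[] column) {
        int[] coldata = column.clone();
        Arrays.sort(coldata);
        for (int i = 0; i < coldata.length; ) {
            // s: start, i: end (exclusive), v: value, f: frequency
            int s = i, v = coldata[i], f = 1;
            while (++i < coldata.length && v == coldata[i]) {
                f++;
            }
            // value = first position + last position of the tie group:
            assert v == s + (i - 1) : "Bad value at position " + s;
            // the value is odd iff the frequency is even:
            assert ((v ^ f) & 1) == 1 : "Even/odd rule violated at position " + s;
        }
    }

    public static void main(String[] args) {
        // One singleton, one tie of size 2, one tie of size 3:
        checkInvariant(new int[] { 0, 3, 3, 8, 8, 8 });
    }
}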

Example 2 with IntegerArray

Use of de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray in project elki by elki-project.

The class NNChain, method nnChainCore().

/**
 * Uses NNChain as in "Modern hierarchical, agglomerative clustering
 * algorithms" by Daniel Müllner
 *
 * @param mat Matrix view
 * @param builder Result builder
 */
private void nnChainCore(MatrixParadigm mat, PointerHierarchyRepresentationBuilder builder) {
    final DBIDArrayIter ix = mat.ix;
    final double[] distances = mat.matrix;
    final int size = mat.size;
    // The maximum chain size = number of ids + 1
    IntegerArray chain = new IntegerArray(size + 1);
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running NNChain", size - 1, LOG) : null;
    for (int k = 1, end = size; k < size; k++) {
        int a = -1, b = -1;
        if (chain.size() <= 3) {
            // Accessing two arbitrary not yet merged elements could be optimized to
            // work in O(1) like in Müllner;
            // however this usually does not have a huge impact (empirically just
            // about 1/5000 of total performance)
            a = findUnlinked(0, end, ix, builder);
            b = findUnlinked(a + 1, end, ix, builder);
            chain.clear();
            chain.add(a);
        } else {
            // Chain is expected to look like (.... a, b, c, b) with b and c merged.
            int lastIndex = chain.size;
            int c = chain.get(lastIndex - 2);
            b = chain.get(lastIndex - 3);
            a = chain.get(lastIndex - 4);
            // Ensure we had a loop at the end:
            assert (chain.get(lastIndex - 1) == c || chain.get(lastIndex - 1) == b);
            // if c < b, then we merged b -> c, otherwise c -> b
            b = c < b ? c : b;
            // Cut the tail:
            chain.size -= 3;
        }
        // For ties, always prefer the second-last element b:
        double minDist = mat.get(a, b);
        do {
            int c = b;
            final int ta = MatrixParadigm.triangleSize(a);
            for (int i = 0; i < a; i++) {
                if (i != b && !builder.isLinked(ix.seek(i))) {
                    double dist = distances[ta + i];
                    if (dist < minDist) {
                        minDist = dist;
                        c = i;
                    }
                }
            }
            for (int i = a + 1; i < size; i++) {
                if (i != b && !builder.isLinked(ix.seek(i))) {
                    double dist = distances[MatrixParadigm.triangleSize(i) + a];
                    if (dist < minDist) {
                        minDist = dist;
                        c = i;
                    }
                }
            }
            b = a;
            a = c;
            chain.add(a);
        } while (chain.size() < 3 || a != chain.get(chain.size - 1 - 2));
        // We always merge the larger into the smaller index:
        if (a < b) {
            int tmp = a;
            a = b;
            b = tmp;
        }
        assert (minDist == mat.get(a, b));
        assert (b < a);
        merge(size, mat, builder, minDist, a, b);
        // Shrink working set
        end = AGNES.shrinkActiveSet(ix, builder, end, a);
        LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter), IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray)
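
The chain bookkeeping above is easier to follow on a toy example. The sketch below is a simplified, standalone nearest-neighbor chain in the spirit of the Müllner reference cited in the javadoc, not ELKI's NNChain: it works on a full double[][] distance matrix, uses java.util.ArrayDeque as the chain, merges with single linkage, and omits the tie handling, MatrixParadigm bookkeeping and active-set shrinking of the code above. Class and method names are made up for illustration.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Iterator;

public class NNChainSketch {
    /** Agglomerate the points of a full distance matrix with single linkage. */
    static void cluster(double[][] d) {
        final int n = d.length;
        // true once a point has been merged away
        boolean[] dead = new boolean[n];
        Deque<Integer> chain = new ArrayDeque<>();
        for (int merges = 0; merges < n - 1; merges++) {
            if (chain.isEmpty()) {
                // Start a new chain at any still-active point:
                for (int i = 0; i < n; i++) {
                    if (!dead[i]) {
                        chain.push(i);
                        break;
                    }
                }
            }
            while (true) {
                int a = chain.peek();
                // Element below the top of the chain, or null:
                Integer b = predecessor(chain);
                // Nearest still-active neighbor of a; ties prefer the predecessor b.
                int c = b == null ? -1 : b;
                double min = b == null ? Double.POSITIVE_INFINITY : d[a][b];
                for (int i = 0; i < n; i++) {
                    if (i != a && !dead[i] && d[a][i] < min) {
                        min = d[a][i];
                        c = i;
                    }
                }
                if (b != null && c == b) {
                    // Reciprocal nearest neighbors: merge the pair.
                    chain.pop();
                    chain.pop();
                    int lo = Math.min(a, c), hi = Math.max(a, c);
                    System.out.println("merge " + hi + " into " + lo + " at " + min);
                    for (int i = 0; i < n; i++) {
                        if (i != lo && i != hi) {
                            // Single-linkage distance update:
                            d[lo][i] = d[i][lo] = Math.min(d[lo][i], d[hi][i]);
                        }
                    }
                    dead[hi] = true;
                    break;
                }
                chain.push(c);
            }
        }
    }

    private static Integer predecessor(Deque<Integer> chain) {
        Iterator<Integer> it = chain.iterator();
        // skip the top element
        it.next();
        return it.hasNext() ? it.next() : null;
    }

    public static void main(String[] args) {
        double[][] d = { { 0, 1, 4, 5 }, { 1, 0, 3, 6 }, { 4, 3, 0, 2 }, { 5, 6, 2, 0 } };
        // Expected merges: (0,1) at 1.0, then (2,3) at 2.0, then (0,2) at 3.0.
        cluster(d);
    }
}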

Example 3 with IntegerArray

Use of de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray in project elki by elki-project.

The class NearestNeighborAffinityMatrixBuilder, method computePij().

/**
 * Compute the sparse pij using the nearest neighbors only.
 *
 * @param ids ID range
 * @param knnq kNN query
 * @param square Use squared distances
 * @param numberOfNeighbours Number of neighbors to get
 * @param pij Output of distances
 * @param indices Output of indexes
 * @param initialScale Initial scaling factor
 */
protected void computePij(DBIDRange ids, KNNQuery<?> knnq, boolean square, int numberOfNeighbours, double[][] pij, int[][] indices, double initialScale) {
    Duration timer = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".runtime.neighborspijmatrix").begin() : null;
    final double logPerp = FastMath.log(perplexity);
    // Scratch arrays, resizable
    DoubleArray dists = new DoubleArray(numberOfNeighbours + 10);
    IntegerArray inds = new IntegerArray(numberOfNeighbours + 10);
    // Compute nearest-neighbor sparse affinity matrix
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Finding neighbors and optimizing perplexity", ids.size(), LOG) : null;
    MeanVariance mv = LOG.isStatistics() ? new MeanVariance() : null;
    for (DBIDArrayIter ix = ids.iter(); ix.valid(); ix.advance()) {
        dists.clear();
        inds.clear();
        KNNList neighbours = knnq.getKNNForDBID(ix, numberOfNeighbours + 1);
        convertNeighbors(ids, ix, square, neighbours, dists, inds);
        double beta = computeSigma(ix.getOffset(), dists, perplexity, logPerp, //
                pij[ix.getOffset()] = new double[dists.size()]);
        if (mv != null) {
            // Sigma
            mv.put(beta > 0 ? FastMath.sqrt(.5 / beta) : 0.);
        }
        indices[ix.getOffset()] = inds.toArray();
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    // Sum of the sparse affinity matrix:
    double sum = 0.;
    for (int i = 0; i < pij.length; i++) {
        final double[] pij_i = pij[i];
        for (int j = 0; j < pij_i.length; j++) {
            sum += pij_i[j];
        }
    }
    final double scale = initialScale / (2 * sum);
    for (int i = 0; i < pij.length; i++) {
        final double[] pij_i = pij[i];
        for (int offi = 0; offi < pij_i.length; offi++) {
            int j = indices[i][offi];
            assert (i != j);
            int offj = containsIndex(indices[j], i);
            if (offj >= 0) {
                // Found
                assert (indices[j][offj] == i);
                // Exploit symmetry:
                if (i < j) {
                    // Symmetrize
                    final double val = pij_i[offi] + pij[j][offj];
                    pij_i[offi] = pij[j][offj] = MathUtil.max(val * scale, MIN_PIJ);
                }
            } else {
                // Not found
                // TODO: the original code produces a symmetric matrix;
                // here the result will no longer sum to EARLY_EXAGGERATION.
                pij_i[offi] = MathUtil.max(pij_i[offi] * scale, MIN_PIJ);
            }
        }
    }
    if (LOG.isStatistics()) {
        // timer != null, mv != null
        LOG.statistics(timer.end());
        LOG.statistics(new DoubleStatistic(NearestNeighborAffinityMatrixBuilder.class.getName() + ".sigma.average", mv.getMean()));
        LOG.statistics(new DoubleStatistic(NearestNeighborAffinityMatrixBuilder.class.getName() + ".sigma.stddev", mv.getSampleStddev()));
    }
}
Also used: DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic), MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance), FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration), DoubleArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray), IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray)
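
The containsIndex helper used above is defined elsewhere in the class; from its use here it is assumed to return the offset of a value within a (short) neighbor index list, or a negative number when absent. A minimal sketch of that lookup together with the arithmetic symmetrization of one mutual pair follows; Math.max and the made-up minPij stand in for ELKI's MathUtil.max and MIN_PIJ, and the numbers are invented.

public class SparseSymmetrizeSketch {
    // Hypothetical stand-in for containsIndex(int[], int): a linear scan that
    // returns the offset of the value, or -1 when it is not contained.
    static int containsIndex(int[] is, int i) {
        for (int off = 0; off < is.length; off++) {
            if (is[off] == i) {
                return off;
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        // Two rows: 0 lists neighbor 1 and 1 lists neighbor 0, so the pair is mutual.
        int[][] indices = { { 1 }, { 0 } };
        double[][] pij = { { 0.25 }, { 0.75 } };
        double scale = 0.5, minPij = 1e-12;
        int i = 0, offi = 0, j = indices[i][offi];
        int offj = containsIndex(indices[j], i);
        if (offj >= 0 && i < j) {
            // Same rule as the i < j branch above: scaled sum of both directions.
            double val = pij[i][offi] + pij[j][offj];
            pij[i][offi] = pij[j][offj] = Math.max(val * scale, minPij);
        }
        System.out.println(pij[0][0] + " == " + pij[1][0]); // prints 0.5 == 0.5
    }
}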

Example 4 with IntegerArray

Use of de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray in project elki by elki-project.

The class IntrinsicNearestNeighborAffinityMatrixBuilder, method computePij().

/**
 * Compute the sparse pij using the nearest neighbors only.
 *
 * @param ids ID range
 * @param knnq kNN query
 * @param square Use squared distances
 * @param numberOfNeighbours Number of neighbors to get
 * @param pij Output of distances
 * @param indices Output of indexes
 * @param initialScale Initial scaling factor
 */
protected void computePij(DBIDRange ids, KNNQuery<?> knnq, boolean square, int numberOfNeighbours, double[][] pij, int[][] indices, double initialScale) {
    Duration timer = LOG.isStatistics() ? LOG.newDuration(this.getClass().getName() + ".runtime.neighborspijmatrix").begin() : null;
    final double logPerp = FastMath.log(perplexity);
    // Scratch arrays, resizable
    DoubleArray dists = new DoubleArray(numberOfNeighbours + 10);
    IntegerArray inds = new IntegerArray(numberOfNeighbours + 10);
    // Compute nearest-neighbor sparse affinity matrix
    FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Finding neighbors and optimizing perplexity", ids.size(), LOG) : null;
    MeanVariance mv = LOG.isStatistics() ? new MeanVariance() : null;
    Mean mid = LOG.isStatistics() ? new Mean() : null;
    for (DBIDArrayIter ix = ids.iter(); ix.valid(); ix.advance()) {
        dists.clear();
        inds.clear();
        KNNList neighbours = knnq.getKNNForDBID(ix, numberOfNeighbours + 1);
        convertNeighbors(ids, ix, square, neighbours, dists, inds, mid);
        double beta = computeSigma(ix.getOffset(), dists, perplexity, logPerp, //
                pij[ix.getOffset()] = new double[dists.size()]);
        if (mv != null) {
            // Sigma
            mv.put(beta > 0 ? FastMath.sqrt(.5 / beta) : 0.);
        }
        indices[ix.getOffset()] = inds.toArray();
        LOG.incrementProcessed(prog);
    }
    LOG.ensureCompleted(prog);
    if (mid != null) {
        LOG.statistics(new DoubleStatistic(getClass() + ".average-original-id", mid.getMean()));
    }
    // Sum of the sparse affinity matrix:
    double sum = 0.;
    for (int i = 0; i < pij.length; i++) {
        final double[] pij_i = pij[i];
        for (int offi = 0; offi < pij_i.length; offi++) {
            int j = indices[i][offi];
            if (j > i) {
                // Exploit symmetry.
                continue;
            }
            assert (i != j);
            int offj = containsIndex(indices[j], i);
            if (offj >= 0) {
                // Found
                sum += FastMath.sqrt(pij_i[offi] * pij[j][offj]);
            }
        }
    }
    final double scale = initialScale / (2 * sum);
    for (int i = 0; i < pij.length; i++) {
        final double[] pij_i = pij[i];
        for (int offi = 0; offi < pij_i.length; offi++) {
            int j = indices[i][offi];
            assert (i != j);
            int offj = containsIndex(indices[j], i);
            if (offj >= 0) {
                // Found
                assert (indices[j][offj] == i);
                // Exploit symmetry:
                if (i < j) {
                    // Symmetrize
                    final double val = FastMath.sqrt(pij_i[offi] * pij[j][offj]);
                    pij_i[offi] = pij[j][offj] = MathUtil.max(val * scale, MIN_PIJ);
                }
            } else {
                // Not found, so zero.
                pij_i[offi] = 0;
            }
        }
    }
    if (LOG.isStatistics()) {
        // timer != null, mv != null
        LOG.statistics(timer.end());
        LOG.statistics(new DoubleStatistic(NearestNeighborAffinityMatrixBuilder.class.getName() + ".sigma.average", mv.getMean()));
        LOG.statistics(new DoubleStatistic(NearestNeighborAffinityMatrixBuilder.class.getName() + ".sigma.stddev", mv.getSampleStddev()));
    }
}
Also used: Mean (de.lmu.ifi.dbs.elki.math.Mean), FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration), IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray), DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic), MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance), DoubleArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray)
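
Compared to Example 3, this variant first sums the geometric means sqrt(p_{j|i} * p_{i|j}) over mutual neighbor pairs and then symmetrizes with the same geometric mean, so the resulting matrix sums to initialScale. A tiny worked sketch of that two-pass scheme for a single mutual pair; the values are made up, and Math.sqrt and Math.max stand in for FastMath.sqrt and MathUtil.max.

public class GeometricSymmetrizeSketch {
    public static void main(String[] args) {
        // One mutual neighbor pair: the two one-sided affinities.
        double p01 = 0.125, p10 = 0.5;
        double initialScale = 1.0, minPij = 1e-12;

        // Pass 1: sum of geometric means over mutual pairs (here just one),
        // then the same scale = initialScale / (2 * sum) as above.
        double sum = Math.sqrt(p01 * p10);       // 0.25
        double scale = initialScale / (2 * sum); // 2.0

        // Pass 2: both directions receive the scaled geometric mean.
        double val = Math.sqrt(p01 * p10);
        p01 = p10 = Math.max(val * scale, minPij);

        System.out.println(p01 + " + " + p10 + " = " + (p01 + p10)); // sums to initialScale
    }
}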

Example 5 with IntegerArray

Use of de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray in project elki by elki-project.

The class MiniMaxNNChain, method nnChainCore().

/**
 * Uses NNChain as in "Modern hierarchical, agglomerative clustering
 * algorithms" by Daniel Müllner
 *
 * @param mat distance matrix
 * @param prots computed prototypes
 * @param dq distance query of the data set
 * @param builder Result builder
 * @param clusters current clusters
 */
private void nnChainCore(MatrixParadigm mat, DBIDArrayMIter prots, DistanceQuery<O> dq, PointerHierarchyRepresentationBuilder builder, Int2ObjectOpenHashMap<ModifiableDBIDs> clusters) {
    final DBIDArrayIter ix = mat.ix;
    final double[] distances = mat.matrix;
    final int size = mat.size;
    // The maximum chain size = number of ids + 1
    IntegerArray chain = new IntegerArray(size + 1);
    FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("Running MiniMax-NNChain", size - 1, LOG) : null;
    for (int k = 1, end = size; k < size; k++) {
        int a = -1, b = -1;
        if (chain.size() <= 3) {
            // Accessing two arbitrary not yet merged elements could be optimized to
            // work in O(1) like in Müllner;
            // however this usually does not have a huge impact (empirically just
            // about 1/5000 of total performance)
            a = NNChain.findUnlinked(0, end, ix, builder);
            b = NNChain.findUnlinked(a + 1, end, ix, builder);
            chain.clear();
            chain.add(a);
        } else {
            // Chain is expected to look like (.... a, b, c, b) with b and c merged.
            int lastIndex = chain.size;
            int c = chain.get(lastIndex - 2);
            b = chain.get(lastIndex - 3);
            a = chain.get(lastIndex - 4);
            // Ensure we had a loop at the end:
            assert (chain.get(lastIndex - 1) == c || chain.get(lastIndex - 1) == b);
            // if c < b, then we merged b -> c, otherwise c -> b
            b = c < b ? c : b;
            // Cut the tail:
            chain.size -= 3;
        }
        // For ties, always prefer the second-last element b:
        double minDist = mat.get(a, b);
        do {
            int c = b;
            final int ta = MatrixParadigm.triangleSize(a);
            for (int i = 0; i < a; i++) {
                if (i != b && !builder.isLinked(ix.seek(i))) {
                    double dist = distances[ta + i];
                    if (dist < minDist) {
                        minDist = dist;
                        c = i;
                    }
                }
            }
            for (int i = a + 1; i < size; i++) {
                if (i != b && !builder.isLinked(ix.seek(i))) {
                    double dist = distances[MatrixParadigm.triangleSize(i) + a];
                    if (dist < minDist) {
                        minDist = dist;
                        c = i;
                    }
                }
            }
            b = a;
            a = c;
            chain.add(a);
        } while (chain.size() < 3 || a != chain.get(chain.size - 1 - 2));
        // We always merge the larger into the smaller index:
        if (a < b) {
            int tmp = a;
            a = b;
            b = tmp;
        }
        assert (minDist == mat.get(a, b));
        assert (b < a);
        MiniMax.merge(size, mat, prots, builder, clusters, dq, a, b);
        // Shrink working set
        end = AGNES.shrinkActiveSet(ix, builder, end, a);
        LOG.incrementProcessed(progress);
    }
    LOG.ensureCompleted(progress);
}
Also used: FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress), IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray)
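
Both NN-chain variants (Examples 2 and 5) use IntegerArray as a cheap stack of primitive ints: add pushes, get(size - k) peeks below the top, clear resets, and writing to the public size field cuts the tail in O(1), all of which appears in the code above. The following dependency-free sketch mirrors just that idiom; IntStackSketch is a made-up stand-in, not ELKI's IntegerArray.

import java.util.Arrays;

public class IntStackSketch {
    // Backing storage and element count, both directly accessible as in the usage above.
    int[] data;
    int size;

    IntStackSketch(int capacity) {
        data = new int[capacity];
    }

    void add(int v) {
        if (size == data.length) {
            // grow when full
            data = Arrays.copyOf(data, data.length << 1);
        }
        data[size++] = v;
    }

    int get(int i) {
        return data[i];
    }

    void clear() {
        size = 0;
    }

    public static void main(String[] args) {
        IntStackSketch chain = new IntStackSketch(8);
        // A chain that looks like (..., a, b, c, b), as in the comment above:
        for (int v : new int[] { 7, 3, 5, 3 }) {
            chain.add(v);
        }
        int c = chain.get(chain.size - 2); // 5
        int b = chain.get(chain.size - 3); // 3
        int a = chain.get(chain.size - 4); // 7
        // Cut the tail in O(1), as the code above does with chain.size -= 3:
        chain.size -= 3;
        System.out.println(a + " " + b + " " + c + " size=" + chain.size);
    }
}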

Aggregations

IntegerArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.IntegerArray): 5
FiniteProgress (de.lmu.ifi.dbs.elki.logging.progress.FiniteProgress): 4
DoubleStatistic (de.lmu.ifi.dbs.elki.logging.statistics.DoubleStatistic): 2
Duration (de.lmu.ifi.dbs.elki.logging.statistics.Duration): 2
MeanVariance (de.lmu.ifi.dbs.elki.math.MeanVariance): 2
DoubleArray (de.lmu.ifi.dbs.elki.utilities.datastructures.arraylike.DoubleArray): 2
IntegerVector (de.lmu.ifi.dbs.elki.data.IntegerVector): 1
DBIDArrayIter (de.lmu.ifi.dbs.elki.database.ids.DBIDArrayIter): 1
AbstractDataSourceTest (de.lmu.ifi.dbs.elki.datasource.AbstractDataSourceTest): 1
MultipleObjectsBundle (de.lmu.ifi.dbs.elki.datasource.bundle.MultipleObjectsBundle): 1
Mean (de.lmu.ifi.dbs.elki.math.Mean): 1
Test (org.junit.Test): 1