use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class P3C method run.
/**
 * Performs the P3C algorithm on the given Database.
 *
 * The run is reported as ten consecutive progress steps: grid partitioning,
 * marking non-uniform bins, building 1-signatures, merging them into cluster
 * cores, pruning redundant cores, EM refinement, hard assignment, outlier
 * detection, removal of too-small clusters, and result generation.
 *
 * @param database Database (not used by this method itself)
 * @param relation Relation to cluster
 * @return Clustering result; if no cluster core is found, a single noise
 *         cluster holding all objects of the relation
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  final int dim = RelationUtil.dimensionality(relation);

  // Overall progress. The total must match the number of beginStep() calls
  // below (1..10), otherwise the progress completes prematurely.
  StepProgress stepProgress = LOG.isVerbose() ? new StepProgress(10) : null;

  if (stepProgress != null) {
    stepProgress.beginStep(1, "Grid-partitioning data.", LOG);
  }
  // Desired number of bins, as per Sturge:
  final int binCount = (int) Math.ceil(1 + MathUtil.log2(relation.size()));
  // Perform 1-dimensional projections, and split into bins.
  SetDBIDs[][] partitions = partitionData(relation, binCount);

  if (stepProgress != null) {
    stepProgress.beginStep(2, "Searching for non-uniform bins in support histograms.", LOG);
  }
  // Set markers for each attribute until they're all deemed uniform.
  final long[][] markers = new long[dim][];
  for (int d = 0; d < dim; d++) {
    final SetDBIDs[] parts = partitions[d];
    if (parts == null) {
      // Never mark any on constant dimensions.
      continue;
    }
    final long[] marked = markers[d] = BitsUtil.zero(binCount);
    // Number of bins marked so far in this dimension.
    int card = 0;
    // NOTE(review): card counts marked *bins*, yet the bound is derived from
    // the number of *dimensions*; binCount - 1 may have been intended.
    // Left unchanged - confirm against chiSquaredUniformTest.
    while (card < dim - 1) {
      // Find bin with largest support, test only the dimensions that were not
      // previously marked.
      int bestBin = chiSquaredUniformTest(parts, marked, card);
      if (bestBin < 0) {
        // Uniform
        break;
      }
      BitsUtil.setI(marked, bestBin);
      card++;
    }
    if (LOG.isDebugging()) {
      LOG.debug("Marked bins in dim " + d + ": " + BitsUtil.toString(marked, binCount));
    }
  }

  if (stepProgress != null) {
    stepProgress.beginStep(3, "Merging marked bins to 1-signatures.", LOG);
  }
  ArrayList<Signature> signatures = constructOneSignatures(partitions, markers);

  if (stepProgress != null) {
    stepProgress.beginStep(4, "Computing cluster cores from merged p-signatures.", LOG);
  }
  ArrayList<Signature> clusterCores = mergeClusterCores(binCount, signatures);

  if (stepProgress != null) {
    stepProgress.beginStep(5, "Pruning redundant cluster cores.", LOG);
  }
  clusterCores = pruneRedundantClusterCores(clusterCores);
  if (LOG.isVerbose()) {
    LOG.verbose("Number of cluster cores found: " + clusterCores.size());
  }

  if (clusterCores.isEmpty()) {
    // Nothing to refine: report everything as a single noise cluster.
    LOG.setCompleted(stepProgress);
    Clustering<SubspaceModel> c = new Clustering<>("P3C", "P3C");
    c.addToplevelCluster(new Cluster<SubspaceModel>(relation.getDBIDs(), true));
    return c;
  }

  if (stepProgress != null) {
    stepProgress.beginStep(6, "Refining cluster cores to clusters via EM.", LOG);
  }
  // Track objects not assigned to any cluster:
  ModifiableDBIDs noise = DBIDUtil.newHashSet();
  WritableDataStore<double[]> probClusterIGivenX = DataStoreUtil.makeStorage(relation.getDBIDs(), DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_SORTED, double[].class);
  int k = clusterCores.size();
  List<MultivariateGaussianModel> models = new ArrayList<>(k);
  computeFuzzyMembership(relation, clusterCores, noise, probClusterIGivenX, models, dim);

  // Initial estimate of covariances, to assign noise objects
  EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
  assignUnassigned(relation, probClusterIGivenX, models, noise);

  double emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
  // A negative maxEmIterations means: iterate until the improvement is at
  // most emDelta.
  for (int it = 1; it <= maxEmIterations || maxEmIterations < 0; it++) {
    final double emOld = emNew;
    EM.recomputeCovarianceMatrices(relation, probClusterIGivenX, models, 0.);
    // reassign probabilities
    emNew = EM.assignProbabilitiesToInstances(relation, models, probClusterIGivenX);
    if (LOG.isVerbose()) {
      LOG.verbose("iteration " + it + " - expectation value: " + emNew);
    }
    if ((emNew - emOld) <= emDelta) {
      break;
    }
  }

  if (stepProgress != null) {
    stepProgress.beginStep(7, "Generating hard clustering.", LOG);
  }
  // Create a hard clustering, making sure each data point only is part of one
  // cluster, based on the best match from the membership matrix.
  ArrayList<ClusterCandidate> clusterCandidates = hardClustering(probClusterIGivenX, clusterCores, relation.getDBIDs());

  if (stepProgress != null) {
    stepProgress.beginStep(8, "Looking for outliers and moving them to the noise set.", LOG);
  }
  // Outlier detection. Remove points from clusters that have a Mahalanobis
  // distance larger than the critical value of the ChiSquare distribution.
  findOutliers(relation, models, clusterCandidates, noise);

  if (stepProgress != null) {
    stepProgress.beginStep(9, "Removing empty clusters.", LOG);
  }
  // Remove near-empty clusters; their remaining members become noise.
  for (Iterator<ClusterCandidate> it = clusterCandidates.iterator(); it.hasNext(); ) {
    ClusterCandidate cand = it.next();
    final int size = cand.ids.size();
    if (size < minClusterSize) {
      if (size > 0) {
        noise.addDBIDs(cand.ids);
      }
      it.remove();
    }
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Number of clusters remaining: " + clusterCandidates.size());
  }

  if (stepProgress != null) {
    stepProgress.beginStep(10, "Generating final result.", LOG);
  }
  // Generate final output.
  Clustering<SubspaceModel> result = new Clustering<>("P3C", "P3C");
  for (int cluster = 0; cluster < clusterCandidates.size(); ++cluster) {
    ClusterCandidate candidate = clusterCandidates.get(cluster);
    CovarianceMatrix cvm = CovarianceMatrix.make(relation, candidate.ids);
    result.addToplevelCluster(new Cluster<>(candidate.ids, new SubspaceModel(new Subspace(candidate.dimensions), cvm.getMeanVector())));
  }
  if (LOG.isVerbose()) {
    LOG.verbose("Noise size: " + noise.size());
  }
  if (noise.size() > 0) {
    result.addToplevelCluster(new Cluster<SubspaceModel>(noise, true));
  }

  LOG.ensureCompleted(stepProgress);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class DOC method run.
/**
 * Performs the DOC or FastDOC (as configured) algorithm on the given
 * Database.
 *
 * Clusters are extracted one at a time: each cluster found is removed from
 * the working set, and the search is repeated until no further cluster is
 * found or the working set is no larger than the minimum cluster size.
 * Whatever remains is reported as one noise cluster over all dimensions.
 *
 * @param database Database
 * @param relation Data relation
 * @return Clustering result
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  // Dimensionality of the data.
  final int d = RelationUtil.dimensionality(relation);
  // Working copy of the ids; clustered objects are removed from it.
  final ArrayModifiableDBIDs remaining = DBIDUtil.newArray(relation.getDBIDs());

  // Precomputed quantities, as described in Figure 2.
  final double twoOverAlpha = 2. / alpha;
  final double r = Math.abs(FastMath.log(d + d) / FastMath.log(beta * .5));
  // Number of outer loop rounds.
  final int outerRounds = (int) twoOverAlpha;
  // Number of inner loop rounds.
  final int uncappedInner = (int) (FastMath.pow(twoOverAlpha, r) * FastMath.log(4));
  // TODO: This should only apply for FastDOC.
  final int innerRounds = Math.min(uncappedInner, Math.min(1000000, d * d));
  // A cluster is only accepted if it has at least this many members.
  final int minClusterSize = (int) (alpha * remaining.size());

  // Collects every cluster found.
  final Clustering<SubspaceModel> result = new Clustering<>("DOC Clusters", "DOC");
  // Tells the user how many clusters were found so far.
  final IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Number of clusters", LOG) : null;

  // To find more than a single cluster, keep searching while enough points
  // are left to form another cluster.
  while (remaining.size() > minClusterSize) {
    final Cluster<SubspaceModel> found = runDOC(database, relation, remaining, d, outerRounds, innerRounds, (int) r, minClusterSize);
    if (found == null) {
      // No cluster could be found anymore; give up.
      break;
    }
    // Record the cluster, then take its members out of the working set.
    result.addToplevelCluster(found);
    remaining.removeDBIDs(found.getIDs());
    if (cprogress != null) {
      cprogress.setProcessed(result.getAllClusters().size(), LOG);
    }
  }

  // Everything left over becomes noise, described in the full space.
  if (remaining.size() > 0) {
    final SubspaceModel noiseModel = new SubspaceModel(new Subspace(BitsUtil.ones(d)), Centroid.make(relation, remaining).getArrayRef());
    result.addToplevelCluster(new Cluster<>(remaining, true, noiseModel));
  }

  LOG.setCompleted(cprogress);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class PROCLUS method run.
/**
 * Performs the PROCLUS algorithm on the given database.
 *
 * The algorithm has three phases: an initialization phase that samples the
 * data and greedily picks candidate medoids, an iterative phase that
 * repeatedly tries medoid sets and keeps the one with the best objective,
 * and a refinement phase that recomputes dimensions and assignments from
 * the clustering of the best medoid set.
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Clustering result, with clusters named {@code cluster_1},
 *         {@code cluster_2}, ...
 * @throws IllegalStateException if the data dimensionality is smaller than
 *         parameter {@code l}
 */
public Clustering<SubspaceModel> run(Database database, Relation<V> relation) {
  if (RelationUtil.dimensionality(relation) < l) {
    throw new IllegalStateException("Dimensionality of data < parameter l! (" + RelationUtil.dimensionality(relation) + " < " + l + ")");
  }
  DistanceQuery<V> distFunc = database.getDistanceQuery(relation, SquaredEuclideanDistanceFunction.STATIC);
  RangeQuery<V> rangeQuery = database.getRangeQuery(distFunc);
  final Random random = rnd.getSingleThreadedRandom();

  // initialization phase
  if (LOG.isVerbose()) {
    LOG.verbose("1. Initialization phase...");
  }
  // Neither the sample nor the medoid candidate set can exceed the data size.
  int sampleSize = Math.min(relation.size(), k_i * k);
  DBIDs sampleSet = DBIDUtil.randomSample(relation.getDBIDs(), sampleSize, random);
  int medoidSize = Math.min(relation.size(), m_i * k);
  ArrayDBIDs medoids = greedy(distFunc, sampleSet, medoidSize, random);
  if (LOG.isDebugging()) {
    LOG.debugFine("sampleSize " + sampleSize + '\n' //
        + "sampleSet " + sampleSet + '\n' //
        + "medoidSize " + medoidSize + '\n' //
        + "m " + medoids);
  }

  // iterative phase
  if (LOG.isVerbose()) {
    LOG.verbose("2. Iterative phase...");
  }
  double bestObjective = Double.POSITIVE_INFINITY;
  ArrayDBIDs bestMedoids = null;
  DBIDs badMedoids = null;
  // Clustering that belongs to bestMedoids.
  ArrayList<PROCLUSCluster> bestClusters = null;
  ArrayDBIDs currentMedoids = initialSet(medoids, k, random);
  if (LOG.isDebugging()) {
    LOG.debugFine("m_c " + currentMedoids);
  }
  IndefiniteProgress cprogress = LOG.isVerbose() ? new IndefiniteProgress("Current number of clusters:", LOG) : null;
  ArrayList<PROCLUSCluster> clusters = null;
  // Counts consecutive trials without improvement; stop after 10 of them.
  int loops = 0;
  while (loops < 10) {
    long[][] dimensions = findDimensions(currentMedoids, relation, distFunc, rangeQuery);
    clusters = assignPoints(currentMedoids, dimensions, relation);
    double objectiveFunction = evaluateClusters(clusters, dimensions, relation);
    if (objectiveFunction < bestObjective) {
      // restart counting loops
      loops = 0;
      bestObjective = objectiveFunction;
      bestMedoids = currentMedoids;
      // Remember the clustering of the best medoids. The loop only ends
      // after non-improving trials, so 'clusters' will no longer hold it.
      bestClusters = clusters;
      badMedoids = computeBadMedoids(currentMedoids, clusters, (int) (relation.size() * 0.1 / k));
    }
    currentMedoids = computeM_current(medoids, bestMedoids, badMedoids, random);
    loops++;
    if (cprogress != null) {
      cprogress.setProcessed(clusters.size(), LOG);
    }
  }
  LOG.setCompleted(cprogress);

  // refinement phase
  if (LOG.isVerbose()) {
    LOG.verbose("3. Refinement phase...");
  }
  // Refine the clustering of the best medoid set rather than that of the
  // last trial. Fall back to the last trial only if no trial ever improved.
  final ArrayList<PROCLUSCluster> refinementInput = bestClusters != null ? bestClusters : clusters;
  List<Pair<double[], long[]>> dimensions = findDimensions(refinementInput, relation);
  List<PROCLUSCluster> finalClusters = finalAssignment(dimensions, relation);

  // build result
  int numClusters = 1;
  Clustering<SubspaceModel> result = new Clustering<>("ProClus clustering", "proclus-clustering");
  for (PROCLUSCluster c : finalClusters) {
    Cluster<SubspaceModel> cluster = new Cluster<>(c.objectIDs);
    cluster.setModel(new SubspaceModel(new Subspace(c.getDimensions()), c.centroid));
    cluster.setName("cluster_" + numClusters++);
    result.addToplevelCluster(cluster);
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class CLIQUE method run.
/**
 * Performs the CLIQUE algorithm on the given database.
 *
 * First, dense subspaces are identified bottom-up: the 1-dimensional dense
 * subspaces are computed, and each further level is derived from the
 * previous one until the full dimensionality is reached or a level has no
 * dense subspace. Second, clusters are determined within the dense
 * subspaces of every level and added to the result.
 *
 * @param relation Data relation to process
 * @return Clustering result
 */
public Clustering<SubspaceModel> run(Relation<V> relation) {
  final int dimensionality = RelationUtil.dimensionality(relation);
  StepProgress step = new StepProgress(2);

  // 1. Identification of subspaces that contain clusters
  step.beginStep(1, "Identification of subspaces that contain clusters", LOG);
  // Entry i holds the dense subspaces of dimensionality i + 1.
  List<List<CLIQUESubspace<V>>> dimensionToDenseSubspaces = new ArrayList<>(dimensionality);
  List<CLIQUESubspace<V>> denseSubspaces = findOneDimensionalDenseSubspaces(relation);
  dimensionToDenseSubspaces.add(denseSubspaces);
  if (LOG.isVerbose()) {
    LOG.verbose("1-dimensional dense subspaces: " + denseSubspaces.size());
  }
  if (LOG.isDebugging()) {
    for (CLIQUESubspace<V> s : denseSubspaces) {
      LOG.debug(s.toString(" "));
    }
  }

  // Derive the k-dimensional dense subspaces from the (k-1)-dimensional ones.
  for (int k = 2; k <= dimensionality && !denseSubspaces.isEmpty(); k++) {
    denseSubspaces = findDenseSubspaces(relation, denseSubspaces);
    assert (dimensionToDenseSubspaces.size() == k - 1);
    dimensionToDenseSubspaces.add(denseSubspaces);
    if (LOG.isVerbose()) {
      LOG.verbose(k + "-dimensional dense subspaces: " + denseSubspaces.size());
    }
    if (LOG.isDebugging()) {
      for (CLIQUESubspace<V> s : denseSubspaces) {
        LOG.debug(s.toString(" "));
      }
    }
  }

  // 2. Identification of clusters
  step.beginStep(2, "Identification of clusters", LOG);
  // build result
  Clustering<SubspaceModel> result = new Clustering<>("CLIQUE clustering", "clique-clustering");
  for (int dim = 0; dim < dimensionToDenseSubspaces.size(); dim++) {
    List<CLIQUESubspace<V>> subspaces = dimensionToDenseSubspaces.get(dim);
    List<Pair<Subspace, ModifiableDBIDs>> modelsAndClusters = determineClusters(subspaces);
    if (LOG.isVerbose()) {
      LOG.verbose((dim + 1) + "-dimensional clusters: " + modelsAndClusters.size());
    }
    for (Pair<Subspace, ModifiableDBIDs> modelAndCluster : modelsAndClusters) {
      Cluster<SubspaceModel> newCluster = new Cluster<>(modelAndCluster.second);
      newCluster.setModel(new SubspaceModel(modelAndCluster.first, Centroid.make(relation, modelAndCluster.second).getArrayRef()));
      result.addToplevelCluster(newCluster);
    }
  }

  // Mark the progress as finished; beginStep(2) alone leaves it open.
  LOG.setCompleted(step);
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
the class KMeansBatchedLloyd method run.
/**
 * Runs batched Lloyd k-means: the data is randomly split into blocks, and
 * the means are updated after every block instead of once per full pass.
 * Iteration stops when a full pass over all blocks changes no assignment,
 * or when the iteration limit (if positive) is reached.
 *
 * @param database Database, passed on to the initializer
 * @param relation Relation to cluster
 * @return Clustering result; empty clusters are omitted
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  final int dim = RelationUtil.dimensionality(relation);

  // Pick the starting means.
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initializer", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());

  // One id set per cluster, plus the per-object cluster index (-1: none yet).
  List<ModifiableDBIDs> clusters = new ArrayList<>();
  for (int c = 0; c < k; c++) {
    clusters.add(DBIDUtil.newHashSet((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);

  // Random partitioning of the data into batches.
  ArrayDBIDs[] parts = DBIDUtil.randomSplit(relation.getDBIDs(), blocks, random);

  // Scratch buffers that are reset before each batch.
  double[][] meanshift = new double[k][dim];
  int[] changesize = new int[k];
  double[] varsum = new double[k];

  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;

  int iteration = 0;
  // A non-positive maxiter means there is no iteration limit.
  while (maxiter <= 0 || iteration < maxiter) {
    LOG.incrementProcessed(prog);
    boolean changed = false;
    FiniteProgress pprog = LOG.isVerbose() ? new FiniteProgress("Batch", parts.length, LOG) : null;
    for (ArrayDBIDs batch : parts) {
      // Clear the scratch space.
      for (double[] shift : meanshift) {
        Arrays.fill(shift, 0.);
      }
      Arrays.fill(changesize, 0);
      // NOTE(review): varsum is zeroed before every batch, so the values
      // logged and stored in the models below were written after the last
      // reset, i.e. during the final batch only - confirm this is intended.
      Arrays.fill(varsum, 0.);
      if (assignToNearestCluster(relation, batch, means, meanshift, changesize, clusters, assignment, varsum)) {
        changed = true;
      }
      // Update the means right after this batch.
      updateMeans(means, meanshift, clusters, changesize);
      LOG.incrementProcessed(pprog);
    }
    LOG.ensureCompleted(pprog);
    logVarstat(varstat, varsum);
    // Converged: no object changed its cluster during this pass.
    if (!changed) {
      break;
    }
    iteration++;
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }

  // Wrap the non-empty clusters into the result.
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int c = 0; c < clusters.size(); c++) {
    DBIDs members = clusters.get(c);
    if (members.size() != 0) {
      result.addToplevelCluster(new Cluster<>(members, new KMeansModel(means[c], varsum[c])));
    }
  }
  return result;
}
Aggregations