use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class KMeansLloyd, method run().
/**
 * Run Lloyd's k-means algorithm on the given relation.
 *
 * @param database Database context (used for initialization)
 * @param relation Vector relation to cluster
 * @return Flat clustering with one {@link KMeansModel} per non-empty cluster
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  // Degenerate input: nothing to cluster.
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Pick the initial cluster centers.
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Per-cluster member sets, sized with some slack above the expected n/k.
  final int capacity = (int) (relation.size() * 2. / k);
  List<ModifiableDBIDs> clusters = new ArrayList<>(k);
  for (int c = 0; c < k; c++) {
    clusters.add(DBIDUtil.newHashSet(capacity));
  }
  // Temporary point-to-cluster assignment; -1 means "not yet assigned".
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  DoubleStatistic varstat = LOG.isStatistics() ? new DoubleStatistic(this.getClass().getName() + ".variance-sum") : null;
  // Lloyd iterations: assign each point, then update the means, until stable
  // (or until maxiter iterations, when maxiter > 0).
  int iteration = 0;
  while (maxiter <= 0 || iteration < maxiter) {
    LOG.incrementProcessed(prog);
    final boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum);
    logVarstat(varstat, varsum);
    if (!changed) {
      break; // Converged: no point switched clusters.
    }
    means = means(clusters, means, relation);
    iteration++;
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Build the output clustering, skipping empty clusters.
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int c = 0; c < clusters.size(); c++) {
    DBIDs members = clusters.get(c);
    if (members.size() == 0) {
      continue;
    }
    result.addToplevelCluster(new Cluster<>(members, new KMeansModel(means[c], varsum[c])));
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class KMeansMinusMinus, method run().
/**
 * Run k-means--: Lloyd-style iterations where the points with the largest
 * distances (outlier candidates, tracked in a min-heap of size
 * {@code heapsize}) are excluded from the mean recomputation, and optionally
 * collected into a separate noise cluster.
 *
 * @param database Database context (used for initialization)
 * @param relation Vector relation to cluster
 * @return Flat clustering, plus an optional noise cluster when {@code noiseFlag} is set
 */
@Override
public Clustering<KMeansModel> run(Database database, Relation<V> relation) {
  // Degenerate input: nothing to cluster.
  if (relation.size() <= 0) {
    return new Clustering<>("k-Means Clustering", "kmeans-clustering");
  }
  // Choose initial means
  if (LOG.isStatistics()) {
    LOG.statistics(new StringStatistic(KEY + ".initialization", initializer.toString()));
  }
  // Initialize the means.
  double[][] means = initializer.chooseInitialMeans(database, relation, k, getDistanceFunction());
  // Heap of the largest distances: rate < 1 is a fraction of the data set
  // size, rate >= 1 is an absolute count of outlier candidates.
  final int heapsize = (int) (rate < 1. ? Math.ceil(relation.size() * rate) : rate);
  DoubleMinHeap minHeap = new DoubleMinHeap(heapsize);
  // Setup cluster assignment store
  List<ModifiableDoubleDBIDList> clusters = new ArrayList<>(k);
  for (int i = 0; i < k; i++) {
    clusters.add(DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
  }
  WritableIntegerDataStore assignment = DataStoreUtil.makeIntegerStorage(relation.getDBIDs(), DataStoreFactory.HINT_TEMP | DataStoreFactory.HINT_HOT, -1);
  double[] varsum = new double[k];
  IndefiniteProgress prog = LOG.isVerbose() ? new IndefiniteProgress("K-Means iteration", LOG) : null;
  // Always computed (not gated on isStatistics()): the vartotal convergence
  // test below relies on logVarstat's return value.
  DoubleStatistic varstat = new DoubleStatistic(this.getClass().getName() + ".variance-sum");
  int iteration = 0;
  double prevvartotal = Double.POSITIVE_INFINITY;
  for (; maxiter <= 0 || iteration < maxiter; iteration++) {
    minHeap.clear();
    for (int i = 0; i < k; i++) {
      clusters.get(i).clear();
    }
    LOG.incrementProcessed(prog);
    boolean changed = assignToNearestCluster(relation, means, clusters, assignment, varsum, minHeap, heapsize);
    double vartotal = logVarstat(varstat, varsum);
    // Stop if no assignment changed, or if the total variance got worse
    // than the previous value.
    if (!changed || vartotal > prevvartotal) {
      break;
    }
    prevvartotal = vartotal;
    // Recompute means, ignoring points at or beyond the outlier threshold
    // (the smallest of the heapsize largest distances).
    means = meansWithTreshhold(clusters, means, relation, heapsize > 0 ? minHeap.peek() : Double.POSITIVE_INFINITY);
  }
  // Create the noise cluster, if requested. With heapsize == 0 there are no
  // outlier candidates, so noiseids remains null in that case.
  ModifiableDoubleDBIDList noiseids = null;
  if (noiseFlag && heapsize > 0) {
    clusters.add(noiseids = DBIDUtil.newDistanceDBIDList((int) (relation.size() * 2. / k)));
    final double tresh = minHeap.peek();
    for (int i = 0; i < k; i++) {
      for (DoubleDBIDListMIter it = clusters.get(i).iter(); it.valid(); it.advance()) {
        final double dist = it.doubleValue();
        // Move the most distant points into the noise cluster:
        if (dist >= tresh) {
          noiseids.add(dist, it);
          assignment.putInt(it, k);
          it.remove();
        }
      }
    }
  }
  LOG.setCompleted(prog);
  if (LOG.isStatistics()) {
    LOG.statistics(new LongStatistic(KEY + ".iterations", iteration));
  }
  // Wrap result
  Clustering<KMeansModel> result = new Clustering<>("k-Means Clustering", "kmeans-clustering");
  for (int i = 0; i < k; i++) {
    DBIDs ids = clusters.get(i);
    if (ids.size() == 0) {
      continue;
    }
    KMeansModel model = new KMeansModel(means[i], varsum[i]);
    result.addToplevelCluster(new Cluster<>(ids, model));
  }
  // Noise cluster. Guard on noiseids != null: with noiseFlag set but
  // heapsize == 0 the list was never created, and the old unconditional
  // ids.size() call would throw a NullPointerException here.
  if (noiseids != null && noiseids.size() > 0) {
    KMeansModel model = new KMeansModel(null, 0);
    result.addToplevelCluster(new Cluster<>(noiseids, true, model));
  }
  return result;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class XMeans, method splitCluster().
/**
 * Conditionally splits the clusters based on the information criterion.
 *
 * @param parentCluster Cluster to split
 * @param database Database
 * @param relation Data relation
 * @return Parent cluster when split decreases clustering quality or child
 * clusters when split improves clustering.
 */
protected List<Cluster<M>> splitCluster(Cluster<M> parentCluster, Database database, Relation<V> relation) {
  ArrayList<Cluster<M>> parentClusterList = new ArrayList<>(1);
  parentClusterList.add(parentCluster);
  // Split is not possible with fewer than two points; return early before
  // building the evaluation clustering and the proxy database.
  if (parentCluster.size() < 2) {
    return parentClusterList;
  }
  // Transform parent cluster into a (single-cluster) clustering for evaluation
  Clustering<M> parentClustering = new Clustering<>(parentCluster.getName(), parentCluster.getName(), parentClusterList);
  // Run a 2-means on the parent cluster's points only.
  ProxyDatabase proxyDB = new ProxyDatabase(parentCluster.getIDs(), database);
  splitInitializer.setInitialMeans(splitCentroid(parentCluster, relation));
  innerKMeans.setK(2);
  Clustering<M> childClustering = innerKMeans.run(proxyDB);
  // Compare parent vs. children under the configured information criterion.
  double parentEvaluation = informationCriterion.quality(parentClustering, getDistanceFunction(), relation);
  double childrenEvaluation = informationCriterion.quality(childClustering, getDistanceFunction(), relation);
  if (LOG.isDebugging()) {
    LOG.debug("parentEvaluation: " + parentEvaluation);
    LOG.debug("childrenEvaluation: " + childrenEvaluation);
  }
  // Check if split is an improvement; XOR with ascending() handles criteria
  // where smaller values are better as well as those where larger is better.
  return (childrenEvaluation > parentEvaluation) ^ informationCriterion.ascending() ? parentClusterList : childClustering.getAllClusters();
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class KNNKernelDensityMinimaClustering, method run().
/**
 * Run the clustering algorithm on a data relation.
 *
 * Two passes over the data, sorted by the single dimension {@code dim}:
 * step 1 estimates a kernel density for each point from its k nearest
 * neighbors in sorted order; step 2 scans for local density minima and
 * cuts a cluster boundary at each one.
 *
 * @param relation Relation (only dimension {@code dim} is used)
 * @return Clustering result: one flat cluster per density valley
 */
public Clustering<ClusterModel> run(Relation<V> relation) {
  ArrayModifiableDBIDs ids = DBIDUtil.newArray(relation.getDBIDs());
  final int size = ids.size();
  // Sort by the sole dimension
  ids.sort(new VectorUtil.SortDBIDsBySingleDimension(relation, dim));
  // Density storage.
  WritableDoubleDataStore density = DataStoreUtil.makeDoubleStorage(ids, DataStoreFactory.HINT_HOT | DataStoreFactory.HINT_TEMP, 0.);
  // Two independent iterators over the same sorted id array: iter walks the
  // current point, iter2 walks its neighborhood window.
  DBIDArrayIter iter = ids.iter(), iter2 = ids.iter();
  StepProgress sprog = LOG.isVerbose() ? new StepProgress("Clustering steps", 2) : null;
  LOG.beginStep(sprog, 1, "Kernel density estimation.");
  {
    // Scratch buffer for distances to up to k neighbors on each side.
    double[] scratch = new double[2 * k];
    iter.seek(0);
    for (int i = 0; i < size; i++, iter.advance()) {
      // Current value.
      final double curv = relation.get(iter).doubleValue(dim);
      // Window bounds: prek predecessors, posk successors (clipped at the ends).
      final int pre = Math.max(i - k, 0), prek = i - pre;
      final int pos = Math.min(i + k, size - 1), posk = pos - i;
      // Collect distances to the preceding neighbors (data is sorted, so
      // curv - predecessor is non-negative).
      iter2.seek(pre);
      for (int j = 0; j < prek; j++, iter2.advance()) {
        scratch[j] = curv - relation.get(iter2).doubleValue(dim);
      }
      assert (iter2.getOffset() == i);
      // Skip the current point itself.
      iter2.advance();
      // Collect distances to the following neighbors.
      for (int j = 0; j < posk; j++, iter2.advance()) {
        scratch[prek + j] = relation.get(iter2).doubleValue(dim) - curv;
      }
      assert (prek + posk >= k);
      // k-nearest-neighbor distance: used as the kernel bandwidth below.
      double kdist = QuickSelect.quickSelect(scratch, 0, prek + posk, k);
      switch(mode) {
      case BALLOON:
      {
        // Balloon estimator: sum kernel contributions of all window
        // neighbors at the current point, scaled by its own kNN distance.
        double dens = 0.;
        if (kdist > 0.) {
          for (int j = 0; j < prek + posk; j++) {
            dens += kernel.density(scratch[j] / kdist);
          }
        } else {
          // kdist == 0: duplicate values; treat density as infinite.
          dens = Double.POSITIVE_INFINITY;
        }
        assert (iter.getOffset() == i);
        density.putDouble(iter, dens);
        break;
      }
      case SAMPLE:
      {
        // Sample-point estimator: scatter this point's kernel contribution
        // onto each neighbor's density instead.
        if (kdist > 0.) {
          iter2.seek(pre);
          for (int j = 0; j < prek; j++, iter2.advance()) {
            double delta = curv - relation.get(iter2).doubleValue(dim);
            density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
          }
          assert (iter2.getOffset() == i);
          iter2.advance();
          for (int j = 0; j < posk; j++, iter2.advance()) {
            double delta = relation.get(iter2).doubleValue(dim) - curv;
            density.putDouble(iter2, density.doubleValue(iter2) + kernel.density(delta / kdist));
          }
        } else {
          // kdist == 0: mark neighbors at zero distance as infinitely dense.
          // The !(delta > 0.) form also catches NaN deltas.
          iter2.seek(pre);
          for (int j = 0; j < prek; j++, iter2.advance()) {
            double delta = curv - relation.get(iter2).doubleValue(dim);
            if (!(delta > 0.)) {
              density.putDouble(iter2, Double.POSITIVE_INFINITY);
            }
          }
          assert (iter2.getOffset() == i);
          iter2.advance();
          for (int j = 0; j < posk; j++, iter2.advance()) {
            double delta = relation.get(iter2).doubleValue(dim) - curv;
            if (!(delta > 0.)) {
              density.putDouble(iter2, Double.POSITIVE_INFINITY);
            }
          }
        }
        break;
      }
      default:
        throw new UnsupportedOperationException("Unknown mode specified.");
      }
    }
  }
  LOG.beginStep(sprog, 2, "Local minima detection.");
  Clustering<ClusterModel> clustering = new Clustering<>("onedimensional-kde-clustering", "One-Dimensional clustering using kernel density estimation.");
  {
    // Ring buffer over the last 2*minwindow+1 densities; the center slot t is
    // a local minimum only if strictly below every other value in the window.
    double[] scratch = new double[2 * minwindow + 1];
    int begin = 0;
    int halfw = (minwindow + 1) >> 1;
    iter.seek(0);
    // Fill initial buffer.
    for (int i = 0; i < size; i++, iter.advance()) {
      // m: slot for the newest density; t: slot of the window-center candidate.
      final int m = i % scratch.length, t = (i - minwindow - 1) % scratch.length;
      scratch[m] = density.doubleValue(iter);
      // NOTE(review): the strict `>` skips one more warm-up position than
      // `>= scratch.length` would; presumably intentional, but worth confirming.
      if (i > scratch.length) {
        double min = Double.POSITIVE_INFINITY;
        for (int j = 0; j < scratch.length; j++) {
          if (j != t && scratch[j] < min) {
            min = scratch[j];
          }
        }
        // Local minimum:
        if (scratch[t] < min) {
          // Cut point, in sorted-array offsets.
          int end = i - minwindow + 1;
          {
            // Test on which side the kNN is; shift the boundary so the
            // minimum point joins the side its halfw-th neighbor is closer to.
            iter2.seek(end);
            double curv = relation.get(iter2).doubleValue(dim);
            iter2.seek(end - halfw);
            double left = relation.get(iter2).doubleValue(dim) - curv;
            iter2.seek(end + halfw);
            double right = curv - relation.get(iter2).doubleValue(dim);
            if (left < right) {
              end++;
            }
          }
          // Emit the cluster [begin, end) and start the next one.
          iter2.seek(begin);
          ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
          for (int j = 0; j < end - begin; j++, iter2.advance()) {
            cids.add(iter2);
          }
          clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
          begin = end;
        }
      }
    }
    // Extract last cluster
    int end = size;
    iter2.seek(begin);
    ArrayModifiableDBIDs cids = DBIDUtil.newArray(end - begin);
    for (int j = 0; j < end - begin; j++, iter2.advance()) {
      cids.add(iter2);
    }
    clustering.addToplevelCluster(new Cluster<>(cids, ClusterModel.CLUSTER));
  }
  LOG.ensureCompleted(sprog);
  return clustering;
}
use of de.lmu.ifi.dbs.elki.data.Clustering in project elki by elki-project.
The class ByLabelClustering, method run().
/**
 * Run the actual clustering algorithm: group the data by label, turning each
 * label with at least two members into a cluster and collecting singletons
 * into one shared noise cluster.
 *
 * @param relation The data input we use
 * @return Clustering with one cluster per label (plus optional noise)
 */
public Clustering<Model> run(Relation<?> relation) {
  // Map each label to the ids carrying it (one label per object, or all
  // labels per object, depending on the "multiple" option).
  HashMap<String, DBIDs> labelMap = multiple ? multipleAssignment(relation) : singleAssignment(relation);
  ModifiableDBIDs noiseids = DBIDUtil.newArray();
  Clustering<Model> result = new Clustering<>("By Label Clustering", "bylabel-clustering");
  for (Entry<String, DBIDs> entry : labelMap.entrySet()) {
    final DBIDs members = entry.getValue();
    // Singleton labels are pooled into the shared noise cluster instead.
    if (members.size() <= 1) {
      noiseids.addDBIDs(members);
      continue;
    }
    // Build a cluster for this label.
    final String label = entry.getKey();
    Cluster<Model> cluster = new Cluster<Model>(label, members, ClusterModel.CLUSTER);
    // Labels matching the configured noise pattern are flagged as noise.
    if (noisepattern != null && noisepattern.matcher(label).find()) {
      cluster.setNoise(true);
    }
    result.addToplevelCluster(cluster);
  }
  // Emit the pooled singletons, if any, as a noise cluster.
  if (noiseids.size() > 0) {
    Cluster<Model> noise = new Cluster<Model>("Noise", noiseids, ClusterModel.CLUSTER);
    noise.setNoise(true);
    result.addToplevelCluster(noise);
  }
  return result;
}
Aggregations