Use of de.lmu.ifi.dbs.elki.data.model.Model in project elki by elki-project.

Class CASH, method run:
/**
 * Run CASH on the relation.
 *
 * @param database Database
 * @param vrel Relation
 * @return Clustering result
 */
public Clustering<Model> run(Database database, Relation<V> vrel) {
  fulldatabase = preprocess(database, vrel);
  processedIDs = DBIDUtil.newHashSet(fulldatabase.size());
  noiseDim = dimensionality(fulldatabase);
  FiniteProgress progress = LOG.isVerbose() ? new FiniteProgress("CASH Clustering", fulldatabase.size(), LOG) : null;
  Clustering<Model> result = doRun(fulldatabase, progress);
  LOG.ensureCompleted(progress);
  if (LOG.isVerbose()) {
    StringBuilder msg = new StringBuilder(1000);
    for (Cluster<Model> c : result.getAllClusters()) {
      if (c.getModel() instanceof LinearEquationModel) {
        LinearEquationModel s = (LinearEquationModel) c.getModel();
        msg.append("\n Cluster: Dim: ").append(s.getLes().subspacedim()).append(" size: ").append(c.size());
      } else {
        msg.append("\n Cluster: ").append(c.getModel().getClass().getName()).append(" size: ").append(c.size());
      }
    }
    LOG.verbose(msg.toString());
  }
  return result;
}
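For context, a minimal sketch of how the returned clustering might be consumed. Here `cash`, `db` and `rel` are hypothetical placeholders for a parameterized CASH instance and an initialized ELKI database and relation; only accessors already visible in the snippet above are used:

// Hypothetical caller: `cash`, `db` and `rel` are assumed to exist elsewhere.
Clustering<Model> clustering = cash.run(db, rel);
for (Cluster<Model> c : clustering.getAllClusters()) {
  // CASH clusters carry a LinearEquationModel describing the found subspace:
  if (c.getModel() instanceof LinearEquationModel) {
    LinearEquationModel lem = (LinearEquationModel) c.getModel();
    System.out.println("subspace dim " + lem.getLes().subspacedim() + ", size " + c.size());
  }
}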
Class ERiC, method extractCorrelationClusters:
/**
 * Extracts the correlation clusters and noise from the COPAC result and
 * returns a mapping of correlation dimension to maps of clusters within this
 * correlation dimension. Each cluster is defined by the basis vectors
 * defining the subspace in which the cluster appears.
 *
 * @param dbscanResult the clustering obtained from the preceding Generalized DBSCAN (COPAC) run
 * @param relation the database containing the objects
 * @param dimensionality the dimensionality of the feature space
 * @param npred ERiC predicate
 * @return a list of clusters for each dimensionality
 */
private List<List<Cluster<CorrelationModel>>> extractCorrelationClusters(Clustering<Model> dbscanResult, Relation<V> relation, int dimensionality, ERiCNeighborPredicate<V>.Instance npred) {
  // result
  List<List<Cluster<CorrelationModel>>> clusterMap = new ArrayList<>();
  for (int i = 0; i <= dimensionality; i++) {
    clusterMap.add(new ArrayList<Cluster<CorrelationModel>>());
  }
  // noise cluster containing all noise objects over all partitions
  Cluster<Model> noise = null;
  // iterate over correlation dimensions
  for (Cluster<Model> clus : dbscanResult.getAllClusters()) {
    DBIDs group = clus.getIDs();
    int dim = clus.isNoise() ? dimensionality : npred.dimensionality(clus.getIDs().iter());
    if (dim < dimensionality) {
      EigenPairFilter filter = new FirstNEigenPairFilter(dim);
      // get cluster list for this dimension.
      List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(dim);
      SortedEigenPairs epairs = settings.pca.processIds(group, relation).getEigenPairs();
      int numstrong = filter.filter(epairs.eigenValues());
      PCAFilteredResult pcares = new PCAFilteredResult(epairs, numstrong, 1., 0.);
      double[] centroid = Centroid.make(relation, group).getArrayRef();
      Cluster<CorrelationModel> correlationCluster = new Cluster<>("[" + dim + "_" + correlationClusters.size() + "]", group, new CorrelationModel(pcares, centroid));
      correlationClusters.add(correlationCluster);
    } else {
      // partition containing noise
      if (noise == null) {
        noise = clus;
      } else {
        ModifiableDBIDs merged = DBIDUtil.newHashSet(noise.getIDs());
        merged.addDBIDs(clus.getIDs());
        noise.setIDs(merged);
      }
    }
  }
  if (noise != null && noise.size() > 0) {
    // get cluster list for this dimension.
    List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(dimensionality);
    EigenPairFilter filter = new FirstNEigenPairFilter(dimensionality);
    SortedEigenPairs epairs = settings.pca.processIds(noise.getIDs(), relation).getEigenPairs();
    int numstrong = filter.filter(epairs.eigenValues());
    PCAFilteredResult pcares = new PCAFilteredResult(epairs, numstrong, 1., 0.);
    double[] centroid = Centroid.make(relation, noise.getIDs()).getArrayRef();
    Cluster<CorrelationModel> correlationCluster = new Cluster<>("[noise]", noise.getIDs(), new CorrelationModel(pcares, centroid));
    correlationClusters.add(correlationCluster);
  }
  // Delete dimensionalities not found.
  for (int i = dimensionality; i > 0; i--) {
    if (!clusterMap.get(i).isEmpty()) {
      break;
    }
    clusterMap.remove(i);
  }
  return clusterMap;
}
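The returned list is indexed by correlation dimensionality: entry i holds the clusters of correlation dimension i, the entry at the full dimensionality holds the merged noise cluster (if any), and trailing empty entries are removed. A hedged sketch of a consumer, assuming `clusterMap` is the value returned above and using only accessors that appear in the snippet:

// Hypothetical consumer of the extraction result.
for (int dim = 0; dim < clusterMap.size(); dim++) {
  for (Cluster<CorrelationModel> c : clusterMap.get(dim)) {
    System.out.println("corrdim " + dim + ": " + c.getName() + ", size " + c.size());
  }
}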
Class ERiC, method run:
/**
 * Performs the ERiC algorithm on the given database.
 *
 * @param database Database to process
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<CorrelationModel> run(Database database, Relation<V> relation) {
  final int dimensionality = RelationUtil.dimensionality(relation);
  StepProgress stepprog = LOG.isVerbose() ? new StepProgress(3) : null;
  // Run Generalized DBSCAN
  LOG.beginStep(stepprog, 1, "Preprocessing local correlation dimensionalities and partitioning data");
  // FIXME: how to ensure we are running on the same relation?
  ERiCNeighborPredicate<V>.Instance npred = new ERiCNeighborPredicate<V>(settings).instantiate(database, relation);
  CorePredicate.Instance<DBIDs> cpred = new MinPtsCorePredicate(settings.minpts).instantiate(database);
  Clustering<Model> copacResult = new GeneralizedDBSCAN.Instance<>(npred, cpred, false).run();
  // extract correlation clusters
  LOG.beginStep(stepprog, 2, "Extract correlation clusters");
  List<List<Cluster<CorrelationModel>>> clusterMap = extractCorrelationClusters(copacResult, relation, dimensionality, npred);
  if (LOG.isDebugging()) {
    StringBuilder msg = new StringBuilder("Step 2: Extract correlation clusters...");
    for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
      List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
      msg.append("\n\ncorrDim ").append(corrDim);
      for (Cluster<CorrelationModel> cluster : correlationClusters) {
        msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
        // .append(", level: ").append(cluster.getLevel()).append(", index:
        // ").append(cluster.getLevelIndex());
        // msg.append("\n basis " +
        // cluster.getPCA().getWeakEigenvectors().toString(" ", NF) +
        // " ids " + cluster.getIDs().size());
      }
    }
    LOG.debugFine(msg.toString());
  }
  if (LOG.isVerbose()) {
    int clusters = 0;
    for (List<Cluster<CorrelationModel>> correlationClusters : clusterMap) {
      clusters += correlationClusters.size();
    }
    LOG.verbose(clusters + " clusters extracted.");
  }
  // build hierarchy
  LOG.beginStep(stepprog, 3, "Building hierarchy");
  Clustering<CorrelationModel> clustering = new Clustering<>("ERiC clustering", "eric-clustering");
  buildHierarchy(clustering, clusterMap, npred);
  if (LOG.isDebugging()) {
    StringBuilder msg = new StringBuilder("Step 3: Build hierarchy");
    for (int corrDim = 0; corrDim < clusterMap.size(); corrDim++) {
      List<Cluster<CorrelationModel>> correlationClusters = clusterMap.get(corrDim);
      for (Cluster<CorrelationModel> cluster : correlationClusters) {
        msg.append("\n cluster ").append(cluster).append(", ids: ").append(cluster.getIDs().size());
// ").append(cluster.getLevelIndex());
        for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterParents(cluster); iter.valid(); iter.advance()) {
          msg.append("\n parent ").append(iter.get());
        }
        for (It<Cluster<CorrelationModel>> iter = clustering.getClusterHierarchy().iterChildren(cluster); iter.valid(); iter.advance()) {
          msg.append("\n child ").append(iter.get());
        }
      }
    }
    LOG.debugFine(msg.toString());
  }
  LOG.setCompleted(stepprog);
  for (Cluster<CorrelationModel> rc : clusterMap.get(clusterMap.size() - 1)) {
    clustering.addToplevelCluster(rc);
  }
  return clustering;
}
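Because ERiC returns a hierarchy (the highest-dimensional clusters become the top level, as the last loop shows), downstream code typically walks it top-down. A minimal sketch, assuming `clustering` is the value returned by run() and using the same hierarchy accessors as the debug output above:

// Start from the top-level (highest correlation dimension) clusters.
for (Cluster<CorrelationModel> top : clustering.getToplevelClusters()) {
  System.out.println("top: " + top.getName());
  for (It<Cluster<CorrelationModel>> it = clustering.getClusterHierarchy().iterChildren(top); it.valid(); it.advance()) {
    System.out.println("  child: " + it.get().getName());
  }
}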
Class NaiveAgglomerativeHierarchicalClustering3, method run:
/**
 * Run the algorithm.
 *
 * @param db Database
 * @param relation Relation
 * @return Clustering hierarchy
 */
public Result run(Database db, Relation<O> relation) {
  DistanceQuery<O> dq = db.getDistanceQuery(relation, getDistanceFunction());
  ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
  final int size = ids.size();
  if (size > 0x10000) {
    throw new AbortException("This implementation does not scale to data sets larger than " + 0x10000 + " instances (~17 GB RAM), which results in an integer overflow.");
  }
  if (Linkage.SINGLE.equals(linkage)) {
    LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
  }
  // Compute the initial (lower triangular) distance matrix.
  double[] scratch = new double[triangleSize(size)];
  DBIDArrayIter ix = ids.iter(), iy = ids.iter();
  // Position counter - must agree with triangleSize!
  int pos = 0;
  boolean square = Linkage.WARD.equals(linkage) && !getDistanceFunction().isSquared();
  for (int x = 0; ix.valid(); x++, ix.advance()) {
    iy.seek(0);
    for (int y = 0; y < x; y++, iy.advance()) {
      scratch[pos] = dq.distance(ix, iy);
      // Ward uses variances -- i.e. squared values
      if (square) {
        scratch[pos] *= scratch[pos];
      }
      pos++;
    }
  }
  // Initialize space for result:
  double[] height = new double[size];
  Arrays.fill(height, Double.POSITIVE_INFINITY);
  // Parent node, to track merges
  // have every object point to itself initially
  ArrayModifiableDBIDs parent = DBIDUtil.newArray(ids);
  // Active clusters, when not trivial.
  Int2ReferenceMap<ModifiableDBIDs> clusters = new Int2ReferenceOpenHashMap<>();
  // Repeat until everything merged, except the desired number of clusters:
  final int stop = size - numclusters;
  FiniteProgress prog = LOG.isVerbose() ? new FiniteProgress("Agglomerative clustering", stop, LOG) : null;
  for (int i = 0; i < stop; i++) {
    double min = Double.POSITIVE_INFINITY;
    int minx = -1, miny = -1;
    for (int x = 0; x < size; x++) {
      if (height[x] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int xbase = triangleSize(x);
      for (int y = 0; y < x; y++) {
        if (height[y] < Double.POSITIVE_INFINITY) {
          continue;
        }
        final int idx = xbase + y;
        if (scratch[idx] < min) {
          min = scratch[idx];
          minx = x;
          miny = y;
        }
      }
    }
    assert (minx >= 0 && miny >= 0);
    // Avoid allocating memory, by reusing existing iterators:
    ix.seek(minx);
    iy.seek(miny);
    // Perform merge in data structure: x -> y
    // Since y < x, prefer keeping y, dropping x.
    height[minx] = min;
    parent.set(minx, iy);
    // Merge into cluster
    ModifiableDBIDs cx = clusters.get(minx);
    ModifiableDBIDs cy = clusters.get(miny);
    // cluster sizes, for averaging
    int sizex = 1, sizey = 1;
    if (cy == null) {
      cy = DBIDUtil.newHashSet();
      cy.add(iy);
    } else {
      sizey = cy.size();
    }
    if (cx == null) {
      cy.add(ix);
    } else {
      sizex = cx.size();
      cy.addDBIDs(cx);
      clusters.remove(minx);
    }
    clusters.put(miny, cy);
    // Update distance matrix. Note: miny < minx
    // Implementation note: most will not need sizej, and could save the
    // hashmap lookup.
    final int xbase = triangleSize(minx), ybase = triangleSize(miny);
    // Write to (y, j), with j < y
    for (int j = 0; j < miny; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final DBIDs idsj = clusters.get(j);
      final int sizej = (idsj == null) ? 1 : idsj.size();
      scratch[ybase + j] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[ybase + j], sizej, min);
    }
    // Write to (j, y), with y < j < x
    for (int j = miny + 1; j < minx; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final int jbase = triangleSize(j);
      final DBIDs idsj = clusters.get(j);
      final int sizej = (idsj == null) ? 1 : idsj.size();
      scratch[jbase + miny] = linkage.combine(sizex, scratch[xbase + j], sizey, scratch[jbase + miny], sizej, min);
    }
    // Write to (j, y), with y < x < j
    for (int j = minx + 1; j < size; j++) {
      if (height[j] < Double.POSITIVE_INFINITY) {
        continue;
      }
      final DBIDs idsj = clusters.get(j);
      final int sizej = (idsj == null) ? 1 : idsj.size();
      final int jbase = triangleSize(j);
      scratch[jbase + miny] = linkage.combine(sizex, scratch[jbase + minx], sizey, scratch[jbase + miny], sizej, min);
    }
    LOG.incrementProcessed(prog);
  }
  LOG.ensureCompleted(prog);
  // Build the clustering result
  final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
  for (int x = 0; x < size; x++) {
    // Entries merged away had their height set above; only the surviving
    // entries (height still infinite) become top-level clusters.
    if (height[x] < Double.POSITIVE_INFINITY) {
      continue;
    }
    DBIDs cids = clusters.get(x);
    if (cids == null) {
      // Singleton that never participated in a merge.
      ix.seek(x);
      cids = DBIDUtil.deref(ix);
    }
    Cluster<Model> cluster = new Cluster<>("Cluster", cids);
    dendrogram.addToplevelCluster(cluster);
  }
  return dendrogram;
}
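The method indexes a packed lower-triangular distance matrix through a triangleSize helper that is not shown in this snippet. Consistent with the position counter in the matrix-initialization loop (pos advances once per pair y < x), it must compute the Gauss sum; a sketch of what such a helper presumably looks like:

// Assumed helper: offset of row x in the packed lower triangle, i.e. the
// number of cells (i, j) with j < i < x. Matches the `pos` counter used
// when filling `scratch` above.
protected static int triangleSize(int x) {
  return (x * (x - 1)) >>> 1;
}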
Class DBSCAN, method run:
/**
 * Performs the DBSCAN algorithm on the given relation.
 *
 * @param relation Relation to process
 * @return Clustering result
 */
public Clustering<Model> run(Relation<O> relation) {
  final int size = relation.size();
  if (size < minpts) {
    Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
    result.addToplevelCluster(new Cluster<Model>(relation.getDBIDs(), true, ClusterModel.CLUSTER));
    return result;
  }
  RangeQuery<O> rangeQuery = QueryUtil.getRangeQuery(relation, getDistanceFunction());
  resultList = new ArrayList<>();
  noise = DBIDUtil.newHashSet();
  runDBSCAN(relation, rangeQuery);
  double averagen = ncounter / (double) relation.size();
  LOG.statistics(new DoubleStatistic(DBSCAN.class.getName() + ".average-neighbors", averagen));
  if (averagen < 1 + 0.1 * (minpts - 1)) {
    LOG.warning("There are very few neighbors found. Epsilon may be too small.");
  }
  if (averagen > 100 * minpts) {
    LOG.warning("There are very many neighbors found. Epsilon may be too large.");
  }
  Clustering<Model> result = new Clustering<>("DBSCAN Clustering", "dbscan-clustering");
  for (ModifiableDBIDs res : resultList) {
    result.addToplevelCluster(new Cluster<Model>(res, ClusterModel.CLUSTER));
  }
  result.addToplevelCluster(new Cluster<Model>(noise, true, ClusterModel.CLUSTER));
  return result;
}
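The two warnings encode a rule of thumb: the average neighborhood size should stay well above 1 + 0.1 * (minpts - 1) and well below 100 * minpts, otherwise epsilon is probably mis-set. A hedged usage sketch, where `dbscan` and `rel` are hypothetical placeholders for a configured DBSCAN instance and an initialized relation:

// Hypothetical caller: `dbscan` was configured with epsilon and minpts elsewhere.
Clustering<Model> result = dbscan.run(rel);
// getAllClusters() includes the noise cluster added as the last top-level cluster.
System.out.println(result.getAllClusters().size() + " clusters (incl. noise)");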