Use of de.lmu.ifi.dbs.elki.data.SparseNumberVector in project elki by elki-project.
From the class TermFrequencyParserTest, method testDBLPData.
/**
 * Loads the DBLP test data through a {@link TermFrequencyParser} producing
 * {@link SparseDoubleVector}s, then checks the Euclidean, sparse Euclidean and
 * arc cosine distances between the first three vectors against fixed
 * reference values, including self-distance and symmetry.
 *
 * @throws IOException if the test resource cannot be opened or closed
 */
@Test
public void testDBLPData() throws IOException {
  // try-with-resources guarantees the stream is released even when an
  // assertion or the parameterization fails part-way through the test.
  try (InputStream is = AbstractSimpleAlgorithmTest.open(DBLP_DATA)) {
    // Setup parser and data loading
    TermFrequencyParser<SparseDoubleVector> parser = new TermFrequencyParser<>(false, SparseDoubleVector.FACTORY);
    InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser);
    ListParameterization config = new ListParameterization();
    config.addParameter(AbstractDatabase.Parameterizer.DATABASE_CONNECTION_ID, dbc);
    Database db = ClassGenericsUtil.parameterizeOrAbort(StaticArrayDatabase.class, config);
    if (config.hasUnusedParameters()) {
      fail("Unused parameters: " + config.getRemainingParameters());
    }
    if (config.hasErrors()) {
      config.logAndClearReportedErrors();
      fail("Parameterization errors.");
    }
    db.initialize();
    Relation<SparseNumberVector> rel = db.getRelation(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH);

    // Get first three objects:
    DBIDIter iter = rel.iterDBIDs();
    SparseNumberVector v1 = rel.get(iter);
    iter.advance();
    SparseNumberVector v2 = rel.get(iter);
    iter.advance();
    SparseNumberVector v3 = rel.get(iter);

    // "Dense" euclidean distance:
    double euclid1_12 = EuclideanDistanceFunction.STATIC.distance(v1, v2);
    double euclid1_13 = EuclideanDistanceFunction.STATIC.distance(v1, v3);
    double euclid1_23 = EuclideanDistanceFunction.STATIC.distance(v2, v3);
    double euclid1_21 = EuclideanDistanceFunction.STATIC.distance(v2, v1);
    // Sparse euclidean distance:
    double euclid2_12 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v2);
    double euclid2_13 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v3);
    double euclid2_23 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v3);
    double euclid2_21 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v1);
    // (Auto-switching) angular distance:
    double arccos_12 = ArcCosineDistanceFunction.STATIC.distance(v1, v2);
    double arccos_13 = ArcCosineDistanceFunction.STATIC.distance(v1, v3);
    double arccos_23 = ArcCosineDistanceFunction.STATIC.distance(v2, v3);
    double arccos_21 = ArcCosineDistanceFunction.STATIC.distance(v2, v1);

    // Self-distance and symmetry. A delta of Double.MIN_VALUE is smaller than
    // the spacing between any two distinct doubles, so these checks
    // effectively demand exact equality.
    assertEquals("Euclidean self-distance is not 0.", 0., EuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
    assertEquals("Sparse Euclidean self-distance is not 0.", 0., SparseEuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
    assertEquals("Arccos self-distance is not 0.", 0., ArcCosineDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
    assertEquals("Euclidean distance not symmetric.", euclid1_12, euclid1_21, Double.MIN_VALUE);
    assertEquals("Sparse Euclidean distance not symmetric.", euclid2_12, euclid2_21, Double.MIN_VALUE);
    assertEquals("Arccos distance not symmetric.", arccos_12, arccos_21, Double.MIN_VALUE);

    // Reference values. The 1e-20 delta is far below double precision at
    // these magnitudes, so the values must match bit-for-bit.
    assertEquals("Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid1_12, 1e-20);
    assertEquals("Sparse Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid2_12, 1e-20);
    assertEquals("Arccos distance 1-2 not as expected.", 0.1901934493141418, arccos_12, 1e-20);
    assertEquals("Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid1_13, 1e-20);
    assertEquals("Sparse Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid2_13, 1e-20);
    assertEquals("Arccos distance 1-3 not as expected.", 0.18654347641726046, arccos_13, 1e-20);
    assertEquals("Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid1_23, 1e-20);
    assertEquals("Sparse Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid2_23, 1e-20);
    assertEquals("Arccos distance 2-3 not as expected.", 0.11138352337990569, arccos_23, 1e-20);
  }
}
Use of de.lmu.ifi.dbs.elki.data.SparseNumberVector in project elki by elki-project.
From the class AbstractKMeans, method sparseMeans.
/**
 * Recomputes the mean vector of every cluster from its sparse member vectors.
 * <p>
 * For a non-empty cluster, the first member is expanded to a dense array, the
 * non-zero entries of all remaining members are added onto it, and the sum is
 * scaled by {@code 1 / clusterSize} via {@code timesEquals}. An empty cluster
 * keeps its previous mean (the same array instance from {@code means}).
 *
 * @param clusters the cluster memberships, one id collection per mean
 * @param means the current means; its length determines the number of clusters
 * @param relation the relation to fetch the sparse vectors from
 * @return the updated means, one row per cluster
 */
private static double[][] sparseMeans(List<? extends DBIDs> clusters, double[][] means, Relation<? extends SparseNumberVector> relation) {
  final int k = means.length;
  final double[][] result = new double[k][];
  for (int c = 0; c < k; c++) {
    final DBIDs members = clusters.get(c);
    if (!members.isEmpty()) {
      DBIDIter it = members.iter();
      // Seed the accumulator with a dense copy of the first member.
      double[] sum = relation.get(it).toArray();
      it.advance();
      // Add the stored (non-zero) entries of every other member.
      while (it.valid()) {
        SparseNumberVector member = relation.get(it);
        int pos = member.iter();
        while (member.iterValid(pos)) {
          sum[member.iterDim(pos)] += member.iterDoubleValue(pos);
          pos = member.iterAdvance(pos);
        }
        it.advance();
      }
      result[c] = timesEquals(sum, 1.0 / members.size());
    } else {
      // Degenerate (empty) cluster: carry the old mean over unchanged for now.
      result[c] = means[c];
    }
  }
  return result;
}
Use of de.lmu.ifi.dbs.elki.data.SparseNumberVector in project elki by elki-project.
From the class InMemoryInvertedIndex, method initialize.
/**
 * Builds the inverted index over all objects of the relation.
 * <p>
 * Every object is handed to {@code indexSparse} or {@code indexDense},
 * depending on whether it is a {@link SparseNumberVector}. Afterwards each
 * list in the index is sorted, and a warning is logged when the ratio of
 * stored entries to (number of lists x relation size) exceeds 0.2. Calling
 * this on an already initialized index logs a warning and rebuilds it.
 */
@Override
public void initialize() {
  if (index != null) {
    LOG.warning("Index was already initialized!");
  }
  index = new ArrayList<>();
  length = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB);

  // Feed every object into the index, choosing the sparse or dense path.
  DBIDIter it = relation.iterDBIDs();
  while (it.valid()) {
    final V vec = relation.get(it);
    if (!(vec instanceof SparseNumberVector)) {
      indexDense(it, vec);
    } else {
      indexSparse(it, (SparseNumberVector) vec);
    }
    it.advance();
  }

  // Sort each list and count the total number of stored entries.
  long entries = 0L;
  for (ModifiableDoubleDBIDList postings : index) {
    postings.sort();
    entries += postings.size();
  }

  // Fraction of occupied cells; the double cast avoids integer arithmetic.
  final double cells = index.size() * (double) relation.size();
  final double sparsity = entries / cells;
  if (sparsity > .2) {
    LOG.warning("Inverted list indexes only perform well for very sparse data. Your data set has a sparsity of " + sparsity);
  }
}
Aggregations