Search in sources :

Example 1 with SparseNumberVector

use of de.lmu.ifi.dbs.elki.data.SparseNumberVector in project elki by elki-project.

In the class TermFrequencyParserTest, method testDBLPData.

/**
 * Parses the DBLP test data into sparse vectors and compares the distances
 * between the first three objects against recorded reference values.
 * <p>
 * Checked for the dense Euclidean, sparse Euclidean and arc cosine distance
 * functions: zero self-distance, symmetry, and the expected pairwise values.
 *
 * @throws IOException if the data stream cannot be opened or closed
 */
@Test
public void testDBLPData() throws IOException {
    // try-with-resources closes the input stream on every exit path, including
    // when fail(...) or an assertion throws. The whole test runs inside the
    // block so the relation is never read after the stream has been closed.
    try (InputStream is = AbstractSimpleAlgorithmTest.open(DBLP_DATA)) {
        // Setup parser and data loading
        TermFrequencyParser<SparseDoubleVector> parser = new TermFrequencyParser<>(false, SparseDoubleVector.FACTORY);
        InputStreamDatabaseConnection dbc = new InputStreamDatabaseConnection(is, null, parser);
        ListParameterization config = new ListParameterization();
        config.addParameter(AbstractDatabase.Parameterizer.DATABASE_CONNECTION_ID, dbc);
        Database db = ClassGenericsUtil.parameterizeOrAbort(StaticArrayDatabase.class, config);
        if (config.hasUnusedParameters()) {
            fail("Unused parameters: " + config.getRemainingParameters());
        }
        if (config.hasErrors()) {
            config.logAndClearReportedErrors();
            fail("Parameterization errors.");
        }
        db.initialize();
        Relation<SparseNumberVector> rel = db.getRelation(TypeUtil.SPARSE_VECTOR_VARIABLE_LENGTH);
        // Get first three objects:
        DBIDIter iter = rel.iterDBIDs();
        SparseNumberVector v1 = rel.get(iter);
        iter.advance();
        SparseNumberVector v2 = rel.get(iter);
        iter.advance();
        SparseNumberVector v3 = rel.get(iter);
        // "Dense" euclidean distance:
        double euclid1_12 = EuclideanDistanceFunction.STATIC.distance(v1, v2);
        double euclid1_13 = EuclideanDistanceFunction.STATIC.distance(v1, v3);
        double euclid1_23 = EuclideanDistanceFunction.STATIC.distance(v2, v3);
        double euclid1_21 = EuclideanDistanceFunction.STATIC.distance(v2, v1);
        // Sparse euclidean distance:
        double euclid2_12 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v2);
        double euclid2_13 = SparseEuclideanDistanceFunction.STATIC.distance(v1, v3);
        double euclid2_23 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v3);
        double euclid2_21 = SparseEuclideanDistanceFunction.STATIC.distance(v2, v1);
        // (Auto-switching) angular distance:
        double arccos_12 = ArcCosineDistanceFunction.STATIC.distance(v1, v2);
        double arccos_13 = ArcCosineDistanceFunction.STATIC.distance(v1, v3);
        double arccos_23 = ArcCosineDistanceFunction.STATIC.distance(v2, v3);
        double arccos_21 = ArcCosineDistanceFunction.STATIC.distance(v2, v1);
        // Self-distance and symmetry. A delta of Double.MIN_VALUE is smaller than
        // any representable nonzero difference between these values, so these
        // checks effectively require exact equality.
        assertEquals("Euclidean self-distance is not 0.", 0., EuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
        assertEquals("Sparse Euclidean self-distance is not 0.", 0., SparseEuclideanDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
        assertEquals("Arccos self-distance is not 0.", 0., ArcCosineDistanceFunction.STATIC.distance(v1, v1), Double.MIN_VALUE);
        assertEquals("Euclidean distance not symmetric.", euclid1_12, euclid1_21, Double.MIN_VALUE);
        assertEquals("Sparse Euclidean distance not symmetric.", euclid2_12, euclid2_21, Double.MIN_VALUE);
        assertEquals("Arccos distance not symmetric.", arccos_12, arccos_21, Double.MIN_VALUE);
        // Reference values. The 1e-20 delta is far below double resolution at
        // these magnitudes, so the computed value must match the literal exactly.
        assertEquals("Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid1_12, 1e-20);
        assertEquals("Sparse Euclidean distance 1-2 not as expected.", 684.4165398352088, euclid2_12, 1e-20);
        assertEquals("Arccos distance 1-2 not as expected.", 0.1901934493141418, arccos_12, 1e-20);
        assertEquals("Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid1_13, 1e-20);
        assertEquals("Sparse Euclidean distance 1-3 not as expected.", 654.9862593978594, euclid2_13, 1e-20);
        assertEquals("Arccos distance 1-3 not as expected.", 0.18654347641726046, arccos_13, 1e-20);
        assertEquals("Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid1_23, 1e-20);
        assertEquals("Sparse Euclidean distance 2-3 not as expected.", 231.78653972998518, euclid2_23, 1e-20);
        assertEquals("Arccos distance 2-3 not as expected.", 0.11138352337990569, arccos_23, 1e-20);
    }
}
Also used : InputStream(java.io.InputStream) Database(de.lmu.ifi.dbs.elki.database.Database) AbstractDatabase(de.lmu.ifi.dbs.elki.database.AbstractDatabase) StaticArrayDatabase(de.lmu.ifi.dbs.elki.database.StaticArrayDatabase) SparseNumberVector(de.lmu.ifi.dbs.elki.data.SparseNumberVector) InputStreamDatabaseConnection(de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection) SparseDoubleVector(de.lmu.ifi.dbs.elki.data.SparseDoubleVector) ListParameterization(de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter) Test(org.junit.Test) AbstractSimpleAlgorithmTest(de.lmu.ifi.dbs.elki.algorithm.AbstractSimpleAlgorithmTest)

Example 2 with SparseNumberVector

use of de.lmu.ifi.dbs.elki.data.SparseNumberVector in project elki by elki-project.

In the class AbstractKMeans, method sparseMeans.

/**
 * Recomputes one mean vector per cluster from the sparse vectors assigned to it.
 * <p>
 * A cluster without members keeps its previous mean unchanged.
 *
 * @param clusters the clusters to compute the means
 * @param means the previous means; reused for empty clusters, and its length
 *        determines how many clusters are processed
 * @param relation the relation the sparse vectors are fetched from
 * @return the mean vectors, one per cluster
 */
private static double[][] sparseMeans(List<? extends DBIDs> clusters, double[][] means, Relation<? extends SparseNumberVector> relation) {
    final int k = means.length;
    final double[][] result = new double[k][];
    for (int c = 0; c < k; c++) {
        final DBIDs members = clusters.get(c);
        if (!members.isEmpty()) {
            final DBIDIter it = members.iter();
            // Start the running sum from the array form of the first member.
            final double[] sum = relation.get(it).toArray();
            it.advance();
            // Add the stored entries of every remaining member.
            while (it.valid()) {
                final SparseNumberVector vec = relation.get(it);
                int pos = vec.iter();
                while (vec.iterValid(pos)) {
                    sum[vec.iterDim(pos)] += vec.iterDoubleValue(pos);
                    pos = vec.iterAdvance(pos);
                }
                it.advance();
            }
            // Scale the sum by the reciprocal of the cluster size.
            result[c] = timesEquals(sum, 1.0 / members.size());
        } else {
            // Keep degenerated means as-is for now.
            result[c] = means[c];
        }
    }
    return result;
}
Also used : SparseNumberVector(de.lmu.ifi.dbs.elki.data.SparseNumberVector)

Example 3 with SparseNumberVector

use of de.lmu.ifi.dbs.elki.data.SparseNumberVector in project elki by elki-project.

In the class InMemoryInvertedIndex, method initialize.

/**
 * Builds the inverted index from all objects of the relation.
 * <p>
 * An already existing index only triggers a warning; it is then replaced by a
 * freshly built one. After indexing, every column is sorted, and a warning is
 * logged when the fraction of filled entries exceeds 0.2.
 */
@Override
public void initialize() {
    if (index != null) {
        LOG.warning("Index was already initialized!");
    }
    index = new ArrayList<>();
    length = DataStoreUtil.makeDoubleStorage(relation.getDBIDs(), DataStoreFactory.HINT_DB);
    // Index every object, using the sparse code path when the vector supports it.
    DBIDIter it = relation.iterDBIDs();
    while (it.valid()) {
        final V vec = relation.get(it);
        if (!(vec instanceof SparseNumberVector)) {
            indexDense(it, vec);
        } else {
            indexSparse(it, (SparseNumberVector) vec);
        }
        it.advance();
    }
    // Sort each column and count the stored entries along the way.
    long entries = 0L;
    for (ModifiableDoubleDBIDList col : index) {
        col.sort();
        entries += col.size();
    }
    // Stored entries relative to (number of columns) x (number of objects).
    final double cells = index.size() * (double) relation.size();
    final double sparsity = entries / cells;
    if (sparsity > .2) {
        LOG.warning("Inverted list indexes only perform well for very sparse data. Your data set has a sparsity of " + sparsity);
    }
}
Also used : ModifiableDoubleDBIDList(de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList) SparseNumberVector(de.lmu.ifi.dbs.elki.data.SparseNumberVector) DBIDIter(de.lmu.ifi.dbs.elki.database.ids.DBIDIter)

Aggregations

SparseNumberVector (de.lmu.ifi.dbs.elki.data.SparseNumberVector)3 DBIDIter (de.lmu.ifi.dbs.elki.database.ids.DBIDIter)2 AbstractSimpleAlgorithmTest (de.lmu.ifi.dbs.elki.algorithm.AbstractSimpleAlgorithmTest)1 SparseDoubleVector (de.lmu.ifi.dbs.elki.data.SparseDoubleVector)1 AbstractDatabase (de.lmu.ifi.dbs.elki.database.AbstractDatabase)1 Database (de.lmu.ifi.dbs.elki.database.Database)1 StaticArrayDatabase (de.lmu.ifi.dbs.elki.database.StaticArrayDatabase)1 ModifiableDoubleDBIDList (de.lmu.ifi.dbs.elki.database.ids.ModifiableDoubleDBIDList)1 InputStreamDatabaseConnection (de.lmu.ifi.dbs.elki.datasource.InputStreamDatabaseConnection)1 ListParameterization (de.lmu.ifi.dbs.elki.utilities.optionhandling.parameterization.ListParameterization)1 InputStream (java.io.InputStream)1 Test (org.junit.Test)1