Search in sources :

Example 1 with DoubleSummary

use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkSimilarity2.

/**
 * Similarity test of a tuple sketch against a theta sketch where the tuple sketch is fed
 * the first 95% of the keys given to the theta sketch.
 *
 * <p>With printing enabled, the distribution is reported to be pretty tight, about +/- 0.7%,
 * which is pretty good since the accuracy of the underlying sketch is about +/- 1.56%.
 */
@Test
public void checkSimilarity2() {
    // tuple, theta
    final int minK = 1 << 12;
    final int thetaCount = 1 << 20;
    final int tupleCount = (int) (thetaCount * 0.95);
    final double threshold = 0.943;
    println("Estimation Mode, minK: " + minK + "\t Th: " + threshold);

    final UpdatableSketch<Double, DoubleSummary> measured = tupleBldr.setNominalEntries(minK).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build();

    // The theta sketch sees every key; the tuple sketch only the leading 95% of them.
    for (int key = 0; key < thetaCount; key++) {
        expected.update(key);
    }
    for (int key = 0; key < tupleCount; key++) {
        measured.update(key, constSummary);
    }

    final double[] jaccardBounds = jaccard(measured, expected, factory.newSummary(), dsso);
    final boolean similar = similarityTest(measured, expected, factory.newSummary(), dsso, threshold);
    println(similar + "\t" + jaccardString(jaccardBounds));
    assertTrue(similar);

    // check identity case: a sketch compared with itself must pass the same threshold
    assertTrue(similarityTest(measured, measured, dsso, threshold));
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 2 with DoubleSummary

use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkNullsEmpties2.

/**
 * Exercises the tuple/theta variants of {@code jaccard} and {@code exactlyEqual} with
 * null inputs, with two empty sketches, and with one empty and one single-entry sketch.
 * In each scenario element [1] of the jaccard result is compared against the threshold.
 */
@Test
public void checkNullsEmpties2() {
    // tuple, theta
    final int minK = 1 << 12;
    final double threshold = 0.95;
    println("Check nulls & empties, minK: " + minK + "\t Th: " + threshold);

    // Scenario 1: both inputs null -> neither similar nor exactly equal
    final double[] nullResults = jaccard(null, null, factory.newSummary(), dsso);
    final boolean nullsSimilar = nullResults[1] > threshold;
    println("null \t null:\t" + nullsSimilar + "\t" + jaccardString(nullResults));
    assertFalse(nullsSimilar);
    assertFalse(exactlyEqual(null, null, factory.newSummary(), dsso));

    final UpdatableSketch<Double, DoubleSummary> measured = tupleBldr.setNominalEntries(minK).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build();

    // Scenario 2: both sketches empty -> similar and exactly equal
    final double[] emptyResults = jaccard(measured, expected, factory.newSummary(), dsso);
    final boolean emptiesSimilar = emptyResults[1] > threshold;
    println("empty\tempty:\t" + emptiesSimilar + "\t" + jaccardString(emptyResults));
    assertTrue(emptiesSimilar);
    assertTrue(exactlyEqual(measured, expected, factory.newSummary(), dsso));
    assertTrue(exactlyEqual(measured, measured, dsso));

    // Scenario 3: one entry added to the theta sketch only -> no longer similar or equal
    expected.update(1);
    final double[] oneSidedResults = jaccard(measured, expected, factory.newSummary(), dsso);
    final boolean oneSidedSimilar = oneSidedResults[1] > threshold;
    println("empty\t    1:\t" + oneSidedSimilar + "\t" + jaccardString(oneSidedResults));
    assertFalse(oneSidedSimilar);
    assertFalse(exactlyEqual(measured, expected, factory.newSummary(), dsso));
    println("");
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 3 with DoubleSummary

use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkDissimilarity2.

/**
 * Dissimilarity test of a tuple sketch against a theta sketch where the tuple sketch is fed
 * only the first 5% of the keys given to the theta sketch.
 *
 * <p>Enable printing on this test and you will see that the distribution is much looser,
 * about +/- 14%.  This is due to the fact that intersections lose accuracy as the ratio of
 * intersection to the union becomes a small number.
 */
@Test
public void checkDissimilarity2() {
    // tuple, theta
    int minK = 1 << 12;
    int u1 = 1 << 20;
    int u2 = (int) (u1 * 0.05);
    double threshold = 0.061;
    println("Estimation Mode, minK: " + minK + "\t Th: " + threshold);
    // Nominal entries are set once per builder, matching the other tests in this class.
    final UpdatableSketch<Double, DoubleSummary> measured = tupleBldr.setNominalEntries(minK).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build();
    // The theta sketch receives all u1 keys.
    for (int i = 0; i < u1; i++) {
        expected.update(i);
    }
    // The tuple sketch receives only the first u2 (5% of u1) keys.
    for (int i = 0; i < u2; i++) {
        measured.update(i, constSummary);
    }
    double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso);
    boolean state = dissimilarityTest(measured, expected, factory.newSummary(), dsso, threshold);
    println(state + "\t" + jaccardString(jResults));
    assertTrue(state);
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 4 with DoubleSummary

use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkEstMode2.

/**
 * Estimation-mode comparison of a tuple sketch and a theta sketch. Both are first fed the
 * same keys and must pass the threshold and the exact-equality check; the tuple sketch is
 * then given a few extra keys, after which both checks must fail.
 */
@Test
public void checkEstMode2() {
    // tuple, theta
    final int nomEntries = 1 << 12;
    final int keyCount = 1 << 20;
    final double threshold = 0.9999;
    println("Estimation Mode, minK: " + nomEntries + "\t Th: " + threshold);

    final UpdatableSketch<Double, DoubleSummary> measured = tupleBldr.setNominalEntries(nomEntries).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

    // Phase 1: identical key streams into both sketches
    for (int key = 0; key < keyCount; key++) {
        measured.update(key, constSummary);
        expected.update(key);
    }
    final double[] sameKeysResults = jaccard(measured, expected, factory.newSummary(), dsso);
    final boolean sameKeysPass = sameKeysResults[1] > threshold;
    println(sameKeysPass + "\t" + jaccardString(sameKeysResults));
    assertTrue(sameKeysPass);
    assertTrue(exactlyEqual(measured, expected, factory.newSummary(), dsso));

    // Phase 2: perturb only the tuple sketch with additional, previously unseen keys
    final int extraKeys = 50; // empirically determined
    final int lastKeyExclusive = keyCount + extraKeys;
    for (int key = keyCount; key < lastKeyExclusive; key++) {
        measured.update(key, constSummary);
    }
    final double[] perturbedResults = jaccard(measured, expected, factory.newSummary(), dsso);
    final boolean perturbedPass = perturbedResults[1] >= threshold;
    println(perturbedPass + "\t" + jaccardString(perturbedResults));
    assertFalse(perturbedPass);
    assertFalse(exactlyEqual(measured, expected, factory.newSummary(), dsso));
    println("");
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 5 with DoubleSummary

use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.

the class TupleExamples2Test method example5.

/**
 * Stateful union and intersection of a tuple sketch with a theta sketch.
 *
 * <p>The tuple sketch is loaded with keys 1..12 (summary value 1.0) and the theta sketch
 * with keys 4..15. The theta entries are given a summary updated with 1.0 when they are
 * combined, so the 9 keys present in both sketches are expected to carry the value 2 and
 * the 6 keys present in only one sketch the value 1.
 */
@Test
public void example5() {
    // stateful, tuple, theta, Mode=sum for both, use dsso1
    // Load source sketches
    final UpdatableSketch<Double, DoubleSummary> tupleSk = tupleBldr.build();
    final UpdateSketch thetaSk = thetaBldr.build();
    for (int i = 1; i <= 12; i++) {
        tupleSk.update(i, 1.0);
        thetaSk.update(i + 3);
    }
    // Union
    final Union<DoubleSummary> union = new Union<>(dsso1);
    union.union(tupleSk);
    union.union(thetaSk, ufactory.newSummary().update(1.0));
    final CompactSketch<DoubleSummary> ucsk = union.getResult();
    int entries = ucsk.getRetainedEntries();
    println("Union Stateful: tuple, theta: " + entries);
    final SketchIterator<DoubleSummary> uiter = ucsk.iterator();
    int counter = 1;
    int twos = 0;
    int ones = 0;
    while (uiter.next()) {
        final int i = (int) uiter.getSummary().getValue();
        // 9 entries = 2, 6 entries = 1
        println(counter++ + ", " + i);
        if (i == 1) {
            ones++;
        }
        if (i == 2) {
            twos++;
        }
    }
    assertEquals(ones, 6);
    assertEquals(twos, 9);
    // Intersection
    final Intersection<DoubleSummary> inter = new Intersection<>(dsso1);
    inter.intersect(tupleSk);
    inter.intersect(thetaSk, ifactory.newSummary().update(1.0));
    final CompactSketch<DoubleSummary> icsk = inter.getResult();
    entries = icsk.getRetainedEntries();
    println("Intersection Stateful: tuple, theta: " + entries);
    // Guard against a vacuous pass: the loop below asserts nothing if the result is empty.
    // Keys 4..12 are common to both sketches, hence 9 retained entries.
    assertEquals(entries, 9);
    final SketchIterator<DoubleSummary> iiter = icsk.iterator();
    counter = 1;
    while (iiter.next()) {
        final int i = (int) iiter.getSummary().getValue();
        // 9 entries = 2
        println(counter++ + ", " + i);
        assertEquals(i, 2);
    }
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) Test(org.testng.annotations.Test)

Aggregations

DoubleSummary (org.apache.datasketches.tuple.adouble.DoubleSummary)18 Test (org.testng.annotations.Test)18 UpdateSketch (org.apache.datasketches.theta.UpdateSketch)12 JaccardSimilarity.dissimilarityTest (org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)6 JaccardSimilarity.similarityTest (org.apache.datasketches.tuple.JaccardSimilarity.similarityTest)6 DoubleSummaryFactory (org.apache.datasketches.tuple.adouble.DoubleSummaryFactory)5 DoubleSummaryDeserializer (org.apache.datasketches.tuple.adouble.DoubleSummaryDeserializer)4 Mode (org.apache.datasketches.tuple.adouble.DoubleSummary.Mode)2 Intersection (org.apache.datasketches.tuple.Intersection)1