Search in sources :

Example 11 with UpdateSketch

use of org.apache.datasketches.theta.UpdateSketch in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkSimilarity2.

/**
 * Similarity test of a tuple sketch against a theta sketch, both built with the same
 * nominal entries. The theta sketch is fed 2^20 keys and the tuple sketch the leading 95%
 * of those keys.
 *
 * <p>With printing enabled on this test the distribution turns out to be pretty tight,
 * about +/- 0.7%, which is pretty good given that the accuracy of the underlying sketch
 * is about +/- 1.56%.
 */
@Test
public void checkSimilarity2() {
    // tuple, theta
    final int nomEntries = 1 << 12;
    final int expectedCount = 1 << 20;
    final int measuredCount = (int) (expectedCount * 0.95);
    final double threshold = 0.943;
    println("Estimation Mode, minK: " + nomEntries + "\t Th: " + threshold);

    final UpdatableSketch<Double, DoubleSummary> measured =
        tupleBldr.setNominalEntries(nomEntries).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

    // Load the theta sketch with every key in [0, expectedCount).
    int key = 0;
    while (key < expectedCount) {
        expected.update(key);
        key++;
    }
    // Load the tuple sketch with the leading keys in [0, measuredCount).
    key = 0;
    while (key < measuredCount) {
        measured.update(key, constSummary);
        key++;
    }

    final double[] jaccardResults = jaccard(measured, expected, factory.newSummary(), dsso);
    final boolean similar =
        similarityTest(measured, expected, factory.newSummary(), dsso, threshold);
    println(similar + "\t" + jaccardString(jaccardResults));
    assertTrue(similar);

    // check identity case: the tuple sketch compared against itself
    assertTrue(similarityTest(measured, measured, dsso, threshold));
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 12 with UpdateSketch

use of org.apache.datasketches.theta.UpdateSketch in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkNullsEmpties2.

/**
 * Exercises the tuple/theta Jaccard helpers with null sketches, with two empty sketches,
 * and with one empty sketch versus a sketch holding a single key.
 */
@Test
public void checkNullsEmpties2() {
    // tuple, theta
    final int nomEntries = 1 << 12;
    final double threshold = 0.95;
    println("Check nulls & empties, minK: " + nomEntries + "\t Th: " + threshold);

    // check both null
    double[] jaccardResults = jaccard(null, null, factory.newSummary(), dsso);
    boolean aboveThreshold = jaccardResults[1] > threshold;
    println("null \t null:\t" + aboveThreshold + "\t" + jaccardString(jaccardResults));
    assertFalse(aboveThreshold);
    assertFalse(exactlyEqual(null, null, factory.newSummary(), dsso));

    final UpdatableSketch<Double, DoubleSummary> measured =
        tupleBldr.setNominalEntries(nomEntries).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

    // check both empty
    jaccardResults = jaccard(measured, expected, factory.newSummary(), dsso);
    aboveThreshold = jaccardResults[1] > threshold;
    println("empty\tempty:\t" + aboveThreshold + "\t" + jaccardString(jaccardResults));
    assertTrue(aboveThreshold);
    assertTrue(exactlyEqual(measured, expected, factory.newSummary(), dsso));
    assertTrue(exactlyEqual(measured, measured, dsso));

    // adjust one: the theta sketch now holds a single key, the tuple sketch stays empty
    expected.update(1);
    jaccardResults = jaccard(measured, expected, factory.newSummary(), dsso);
    aboveThreshold = jaccardResults[1] > threshold;
    println("empty\t    1:\t" + aboveThreshold + "\t" + jaccardString(jaccardResults));
    assertFalse(aboveThreshold);
    assertFalse(exactlyEqual(measured, expected, factory.newSummary(), dsso));
    println("");
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 13 with UpdateSketch

use of org.apache.datasketches.theta.UpdateSketch in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkDissimilarity2.

/**
 * Dissimilarity test of a tuple sketch against a theta sketch, both built with the same
 * nominal entries. The theta sketch is fed 2^20 keys and the tuple sketch only the leading
 * 5% of those keys.
 *
 * <p>Enable printing on this test and you will see that the distribution is much looser,
 * about +/- 14%. This is due to the fact that intersections lose accuracy as the ratio of
 * intersection to the union becomes a small number.
 */
@Test
public void checkDissimilarity2() {
    // tuple, theta
    int minK = 1 << 12;
    int u1 = 1 << 20;
    int u2 = (int) (u1 * 0.05);
    double threshold = 0.061;
    println("Estimation Mode, minK: " + minK + "\t Th: " + threshold);
    // Nominal entries only need to be set once on each builder.
    final UpdatableSketch<Double, DoubleSummary> measured = tupleBldr.setNominalEntries(minK).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build();
    // Theta sketch sees every key in [0, u1).
    for (int i = 0; i < u1; i++) {
        expected.update(i);
    }
    // Tuple sketch sees only the leading keys in [0, u2).
    for (int i = 0; i < u2; i++) {
        measured.update(i, constSummary);
    }
    double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso);
    boolean state = dissimilarityTest(measured, expected, factory.newSummary(), dsso, threshold);
    println(state + "\t" + jaccardString(jResults));
    assertTrue(state);
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 14 with UpdateSketch

use of org.apache.datasketches.theta.UpdateSketch in project sketches-core by DataSketches.

the class JaccardSimilarityTest method checkEstMode2.

/**
 * Estimation-mode comparison of a tuple sketch and a theta sketch. Both are first loaded
 * with the same 2^20 keys and must compare as equal; the tuple sketch then receives 50
 * additional keys, after which the two must no longer compare as equal.
 */
@Test
public void checkEstMode2() {
    // tuple, theta
    final int nomEntries = 1 << 12;
    final int keyCount = 1 << 20;
    final double threshold = 0.9999;
    println("Estimation Mode, minK: " + nomEntries + "\t Th: " + threshold);

    final UpdatableSketch<Double, DoubleSummary> measured =
        tupleBldr.setNominalEntries(nomEntries).build();
    final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

    // Identical key streams into both sketches.
    for (int key = 0; key < keyCount; key++) {
        measured.update(key, constSummary);
        expected.update(key);
    }

    double[] jaccardResults = jaccard(measured, expected, factory.newSummary(), dsso);
    boolean aboveThreshold = jaccardResults[1] > threshold;
    println(aboveThreshold + "\t" + jaccardString(jaccardResults));
    assertTrue(aboveThreshold);
    assertTrue(exactlyEqual(measured, expected, factory.newSummary(), dsso));

    // empirically determined
    final int extendedCount = keyCount + 50;
    for (int key = keyCount; key < extendedCount; key++) {
        measured.update(key, constSummary);
    }

    jaccardResults = jaccard(measured, expected, factory.newSummary(), dsso);
    aboveThreshold = jaccardResults[1] >= threshold;
    println(aboveThreshold + "\t" + jaccardString(jaccardResults));
    assertFalse(aboveThreshold);
    assertFalse(exactlyEqual(measured, expected, factory.newSummary(), dsso));
    println("");
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) JaccardSimilarity.similarityTest(org.apache.datasketches.tuple.JaccardSimilarity.similarityTest) Test(org.testng.annotations.Test) JaccardSimilarity.dissimilarityTest(org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)

Example 15 with UpdateSketch

use of org.apache.datasketches.theta.UpdateSketch in project sketches-core by DataSketches.

the class TupleExamples2Test method example5.

/**
 * Stateful union and intersection of a tuple sketch with a theta sketch.
 *
 * <p>The tuple sketch is loaded with keys 1..12 (summary value 1.0 each) and the theta
 * sketch with keys 4..15, so the two share the 9 keys 4..12. The theta sketch is combined
 * using a summary updated with 1.0; the assertions expect shared keys to carry a summary
 * value of 2 and unshared keys a value of 1.
 */
@Test
public void example5() {
    // stateful, tuple, theta, Mode=sum for both, use dsso1
    // Load source sketches
    final UpdatableSketch<Double, DoubleSummary> tupleSk = tupleBldr.build();
    final UpdateSketch thetaSk = thetaBldr.build();
    for (int i = 1; i <= 12; i++) {
        tupleSk.update(i, 1.0);
        // offset by 3 so that only keys 4..12 overlap with the tuple sketch
        thetaSk.update(i + 3);
    }
    // Union
    final Union<DoubleSummary> union = new Union<>(dsso1);
    union.union(tupleSk);
    union.union(thetaSk, ufactory.newSummary().update(1.0));
    final CompactSketch<DoubleSummary> ucsk = union.getResult();
    int entries = ucsk.getRetainedEntries();
    println("Union Stateful: tuple, theta: " + entries);
    final SketchIterator<DoubleSummary> uiter = ucsk.iterator();
    int counter = 1;
    int twos = 0;
    int ones = 0;
    while (uiter.next()) {
        final int i = (int) uiter.getSummary().getValue();
        // 9 entries = 2, 6 entries = 1
        println(counter++ + ", " + i);
        if (i == 1) {
            ones++;
        }
        if (i == 2) {
            twos++;
        }
    }
    assertEquals(ones, 6);
    assertEquals(twos, 9);
    // Intersection
    final Intersection<DoubleSummary> inter = new Intersection<>(dsso1);
    inter.intersect(tupleSk);
    inter.intersect(thetaSk, ifactory.newSummary().update(1.0));
    final CompactSketch<DoubleSummary> icsk = inter.getResult();
    entries = icsk.getRetainedEntries();
    println("Intersection Stateful: tuple, theta: " + entries);
    // Guard against a vacuous pass: the per-entry assertion below never runs on an empty result.
    assertEquals(entries, 9);
    final SketchIterator<DoubleSummary> iiter = icsk.iterator();
    counter = 1;
    while (iiter.next()) {
        final int i = (int) iiter.getSummary().getValue();
        // 9 entries = 2
        println(counter++ + ", " + i);
        assertEquals(i, 2);
    }
}
Also used : DoubleSummary(org.apache.datasketches.tuple.adouble.DoubleSummary) UpdateSketch(org.apache.datasketches.theta.UpdateSketch) Test(org.testng.annotations.Test)

Aggregations

UpdateSketch (org.apache.datasketches.theta.UpdateSketch)46 Test (org.testng.annotations.Test)42 DoubleSummary (org.apache.datasketches.tuple.adouble.DoubleSummary)12 AnotB (org.apache.datasketches.tuple.AnotB)6 JaccardSimilarity.dissimilarityTest (org.apache.datasketches.tuple.JaccardSimilarity.dissimilarityTest)6 JaccardSimilarity.similarityTest (org.apache.datasketches.tuple.JaccardSimilarity.similarityTest)6 UpdateSketchBuilder (org.apache.datasketches.theta.UpdateSketchBuilder)5 Intersection (org.apache.datasketches.tuple.Intersection)4 MapBasedRow (org.apache.druid.data.input.MapBasedRow)3 TestColumnSelectorFactory (org.apache.druid.query.groupby.epinephelinae.TestColumnSelectorFactory)3 Test (org.junit.Test)3 SketchesArgumentException (org.apache.datasketches.SketchesArgumentException)2 IntegerSummary (org.apache.datasketches.tuple.aninteger.IntegerSummary)2 GroupByQueryRunnerTest (org.apache.druid.query.groupby.GroupByQueryRunnerTest)2 SketchesStateException (org.apache.datasketches.SketchesStateException)1 CompactSketch (org.apache.datasketches.theta.CompactSketch)1 Intersection (org.apache.datasketches.theta.Intersection)1 Union (org.apache.datasketches.tuple.Union)1 SketchHolder (org.apache.druid.query.aggregation.datasketches.theta.SketchHolder)1 InitializedNullHandlingTest (org.apache.druid.testing.InitializedNullHandlingTest)1