use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.
the class JaccardSimilarityTest method checkSimilarity2.
/**
 * Similarity check between a tuple sketch (measured) and a theta sketch (expected).
 *
 * <p>Enable printing on this test and you will see that the distribution is pretty tight,
 * about +/- 0.7%, which is pretty good since the accuracy of the underlying sketch is about
 * +/- 1.56%.
 */
@Test
public void checkSimilarity2() {
  // measured is the tuple sketch, expected is the theta sketch
  final int nomEntries = 1 << 12;
  final int expectedCount = 1 << 20;
  final int measuredCount = (int) (expectedCount * 0.95);
  final double threshold = 0.943;
  println("Estimation Mode, minK: " + nomEntries + "\t Th: " + threshold);

  final UpdatableSketch<Double, DoubleSummary> measured =
      tupleBldr.setNominalEntries(nomEntries).build();
  final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

  // Single pass: every key goes to the theta sketch, only the first measuredCount keys
  // go to the tuple sketch. Each sketch still sees its keys in ascending order.
  for (int key = 0; key < expectedCount; key++) {
    expected.update(key);
    if (key < measuredCount) {
      measured.update(key, constSummary);
    }
  }

  final double[] jaccardBounds = jaccard(measured, expected, factory.newSummary(), dsso);
  final boolean similar =
      similarityTest(measured, expected, factory.newSummary(), dsso, threshold);
  println(similar + "\t" + jaccardString(jaccardBounds));
  assertTrue(similar);

  // identity case: a sketch compared with itself must pass the similarity test
  final boolean selfSimilar = similarityTest(measured, measured, dsso, threshold);
  assertTrue(selfSimilar);
}
use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.
the class JaccardSimilarityTest method checkNullsEmpties2.
/**
 * Exercises the Jaccard helpers with null inputs, with two empty sketches (one tuple, one
 * theta), and with one empty sketch against a sketch holding a single entry.
 */
@Test
public void checkNullsEmpties2() {
  // measured is the tuple sketch, expected is the theta sketch
  final int nomEntries = 1 << 12;
  final double threshold = 0.95;
  println("Check nulls & empties, minK: " + nomEntries + "\t Th: " + threshold);

  // both null: must not exceed the threshold and must not be exactly equal
  final double[] nullBounds = jaccard(null, null, factory.newSummary(), dsso);
  final boolean nullAbove = nullBounds[1] > threshold;
  println("null \t null:\t" + nullAbove + "\t" + jaccardString(nullBounds));
  assertFalse(nullAbove);
  assertFalse(exactlyEqual(null, null, factory.newSummary(), dsso));

  final UpdatableSketch<Double, DoubleSummary> measured =
      tupleBldr.setNominalEntries(nomEntries).build();
  final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

  // both empty: must exceed the threshold and be exactly equal
  final double[] emptyBounds = jaccard(measured, expected, factory.newSummary(), dsso);
  final boolean emptyAbove = emptyBounds[1] > threshold;
  println("empty\tempty:\t" + emptyAbove + "\t" + jaccardString(emptyBounds));
  assertTrue(emptyAbove);
  assertTrue(exactlyEqual(measured, expected, factory.newSummary(), dsso));
  assertTrue(exactlyEqual(measured, measured, dsso));

  // put one entry into the theta sketch only: the two must now differ
  expected.update(1);
  final double[] oneBounds = jaccard(measured, expected, factory.newSummary(), dsso);
  final boolean oneAbove = oneBounds[1] > threshold;
  println("empty\t 1:\t" + oneAbove + "\t" + jaccardString(oneBounds));
  assertFalse(oneAbove);
  assertFalse(exactlyEqual(measured, expected, factory.newSummary(), dsso));
  println("");
}
use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.
the class JaccardSimilarityTest method checkDissimilarity2.
/**
 * Dissimilarity check between a tuple sketch (measured) and a theta sketch (expected).
 *
 * <p>Enable printing on this test and you will see that the distribution is much looser,
 * about +/- 14%. This is due to the fact that intersections lose accuracy as the ratio of
 * intersection to the union becomes a small number.
 */
@Test
public void checkDissimilarity2() {
  // tuple, theta
  int minK = 1 << 12;
  int u1 = 1 << 20;
  int u2 = (int) (u1 * 0.05);
  double threshold = 0.061;
  println("Estimation Mode, minK: " + minK + "\t Th: " + threshold);
  // Nominal entries are set once; the earlier duplicated setter call was redundant.
  final UpdatableSketch<Double, DoubleSummary> measured = tupleBldr.setNominalEntries(minK).build();
  final UpdateSketch expected = thetaBldr.setNominalEntries(minK).build();
  // The theta sketch receives all u1 keys.
  for (int i = 0; i < u1; i++) {
    expected.update(i);
  }
  // The tuple sketch receives only the first u2 keys (5% of u1).
  for (int i = 0; i < u2; i++) {
    measured.update(i, constSummary);
  }
  double[] jResults = jaccard(measured, expected, factory.newSummary(), dsso);
  boolean state = dissimilarityTest(measured, expected, factory.newSummary(), dsso, threshold);
  println(state + "\t" + jaccardString(jResults));
  assertTrue(state);
}
use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.
the class JaccardSimilarityTest method checkEstMode2.
/**
 * Feeds the same keys to a tuple sketch and a theta sketch and checks that they compare as
 * equal, then adds a few extra keys to the tuple sketch only and checks that they no longer do.
 */
@Test
public void checkEstMode2() {
  // measured is the tuple sketch, expected is the theta sketch
  final int nomEntries = 1 << 12;
  final int keyCount = 1 << 20;
  final double threshold = 0.9999;
  println("Estimation Mode, minK: " + nomEntries + "\t Th: " + threshold);

  final UpdatableSketch<Double, DoubleSummary> measured =
      tupleBldr.setNominalEntries(nomEntries).build();
  final UpdateSketch expected = thetaBldr.setNominalEntries(nomEntries).build();

  // Fill each sketch with the identical key range; the sketches are independent, so
  // two separate passes are equivalent to interleaving the updates.
  for (int key = 0; key < keyCount; key++) {
    measured.update(key, constSummary);
  }
  for (int key = 0; key < keyCount; key++) {
    expected.update(key);
  }

  final double[] boundsBefore = jaccard(measured, expected, factory.newSummary(), dsso);
  final boolean aboveBefore = boundsBefore[1] > threshold;
  println(aboveBefore + "\t" + jaccardString(boundsBefore));
  assertTrue(aboveBefore);
  assertTrue(exactlyEqual(measured, expected, factory.newSummary(), dsso));

  // 50 extra keys in the tuple sketch only; the count was empirically determined
  final int extraKeys = 50;
  for (int offset = 0; offset < extraKeys; offset++) {
    measured.update(keyCount + offset, constSummary);
  }

  final double[] boundsAfter = jaccard(measured, expected, factory.newSummary(), dsso);
  final boolean atOrAboveAfter = boundsAfter[1] >= threshold;
  println(atOrAboveAfter + "\t" + jaccardString(boundsAfter));
  assertFalse(atOrAboveAfter);
  assertFalse(exactlyEqual(measured, expected, factory.newSummary(), dsso));
  println("");
}
use of org.apache.datasketches.tuple.adouble.DoubleSummary in project sketches-core by DataSketches.
the class TupleExamples2Test method example5.
/**
 * Stateful union and intersection of a tuple sketch with a theta sketch.
 *
 * <p>The tuple sketch holds keys 1..12, each with summary value 1.0; the theta sketch holds
 * keys 4..15 and is given a summary of 1.0 when it is fed to the set operation. The two key
 * ranges overlap on 4..12 (9 keys); the remaining 6 keys appear in only one sketch.
 */
@Test
public void example5() {
  // stateful, tuple, theta, Mode=sum for both, use dsso1
  // Load source sketches
  final UpdatableSketch<Double, DoubleSummary> tupleSk = tupleBldr.build();
  final UpdateSketch thetaSk = thetaBldr.build();
  for (int i = 1; i <= 12; i++) {
    tupleSk.update(i, 1.0);
    thetaSk.update(i + 3);
  }
  // Union
  final Union<DoubleSummary> union = new Union<>(dsso1);
  union.union(tupleSk);
  union.union(thetaSk, ufactory.newSummary().update(1.0));
  final CompactSketch<DoubleSummary> ucsk = union.getResult();
  int entries = ucsk.getRetainedEntries();
  println("Union Stateful: tuple, theta: " + entries);
  final SketchIterator<DoubleSummary> uiter = ucsk.iterator();
  int counter = 1;
  int twos = 0;
  int ones = 0;
  while (uiter.next()) {
    final int i = (int) uiter.getSummary().getValue();
    // 9 entries = 2, 6 entries = 1
    println(counter++ + ", " + i);
    if (i == 1) {
      ones++;
    }
    if (i == 2) {
      twos++;
    }
  }
  assertEquals(ones, 6);
  assertEquals(twos, 9);
  // Intersection
  final Intersection<DoubleSummary> inter = new Intersection<>(dsso1);
  inter.intersect(tupleSk);
  inter.intersect(thetaSk, ifactory.newSummary().update(1.0));
  final CompactSketch<DoubleSummary> icsk = inter.getResult();
  entries = icsk.getRetainedEntries();
  println("Intersection Stateful: tuple, theta: " + entries);
  // Check the count explicitly: the per-entry assertion in the loop below would pass
  // vacuously if the intersection came back empty.
  assertEquals(entries, 9);
  final SketchIterator<DoubleSummary> iiter = icsk.iterator();
  counter = 1;
  while (iiter.next()) {
    final int i = (int) iiter.getSummary().getValue();
    // 9 entries = 2 (the two 1.0 summaries of each shared key are summed)
    println(counter++ + ", " + i);
    assertEquals(i, 2);
  }
}
Aggregations