use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.
the class DataToFrequentItemsSketchAlgebraicIntermediateFinal method exec.
/**
 * Combines the partial results found in field 0 of the input tuple into a single sketch
 * and returns it in serialized form.
 *
 * <p>The first field of each inner tuple is either a bag coming from the Initial function,
 * whose items are inserted into the sketch, or a serialized sketch from a prior call to the
 * Intermediate function, which is merged into the sketch.
 *
 * @param inputTuple tuple whose field 0 is a bag of tuples to combine
 * @return the tuple produced by {@code Util.serializeSketchToTuple} for the combined sketch
 * @throws IOException propagated from the calls made here
 * @throws IllegalArgumentException if the first field of an inner tuple is null or is
 *     neither a {@code DataBag} nor a {@code DataByteArray}
 */
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
  if (isFirstCall_) {
    // this is to see in the log which way was used by Pig
    Logger.getLogger(getClass()).info("algebraic was used");
    isFirstCall_ = false;
  }
  final ItemsSketch<T> sketch = new ItemsSketch<T>(sketchSize_);
  final DataBag bag = (DataBag) inputTuple.get(0);
  for (Tuple dataTuple : bag) {
    final Object item = dataTuple.get(0);
    if (item instanceof DataBag) {
      // this is a bag from the Initial function.
      // just insert each item of the tuple into the sketch
      DataToFrequentItemsSketch.updateSketch((DataBag) item, sketch);
    } else if (item instanceof DataByteArray) {
      // This is a sketch from a prior call to the
      // Intermediate function. merge it with the
      // current sketch.
      final ItemsSketch<T> incomingSketch = Util.deserializeSketchFromTuple(dataTuple, serDe_);
      sketch.merge(incomingSketch);
    } else {
      // we should never get here.
      // A null item fails both instanceof checks and ends up in this branch, so guard the
      // getClass() call: otherwise a NullPointerException would hide the intended error.
      final String typeName = (item == null) ? "null" : item.getClass().getName();
      throw new IllegalArgumentException(
          "InputTuple.Field0: Bag contains unrecognized types: " + typeName);
    }
  }
  return Util.serializeSketchToTuple(sketch, serDe_);
}
use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.
the class FrequentStringsSketchToEstimatesTest method estimation.
/**
 * Feeds ten distinct strings into a sketch constructed with argument 8, then checks that
 * both the NO_FALSE_POSITIVES and NO_FALSE_NEGATIVES functions return fewer than ten rows
 * and that the former returns fewer rows than the latter.
 */
@Test
public void estimation() throws Exception {
  final String[] items = {"1", "2", "3", "4", "5", "6", "7", "8", "9"};
  final long[] weights = {1000, 500, 200, 100, 50, 20, 10, 5, 2};

  final ItemsSketch<String> source = new ItemsSketch<String>(8);
  for (int i = 0; i < items.length; i++) {
    source.update(items[i], weights[i]);
  }
  // the tenth item goes in through the single-argument update
  source.update("10");

  final Tuple inputTuple =
      PigUtil.objectsToTuple(new DataByteArray(source.toByteArray(new ArrayOfStringsSerDe())));

  final EvalFunc<DataBag> noFalsePositivesFunc =
      new FrequentStringsSketchToEstimates("NO_FALSE_POSITIVES");
  final DataBag noFalsePositives = noFalsePositivesFunc.exec(inputTuple);
  Assert.assertNotNull(noFalsePositives);
  Assert.assertTrue(noFalsePositives.size() < 10);

  final EvalFunc<DataBag> noFalseNegativesFunc =
      new FrequentStringsSketchToEstimates("NO_FALSE_NEGATIVES");
  final DataBag noFalseNegatives = noFalseNegativesFunc.exec(inputTuple);
  Assert.assertNotNull(noFalseNegatives);
  Assert.assertTrue(noFalseNegatives.size() < 10);

  Assert.assertTrue(noFalsePositives.size() < noFalseNegatives.size());
}
use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.
the class FrequentStringsSketchToEstimatesTest method exact.
/**
 * Updates a sketch with "a" twice and "b" once, then checks that the function returns two
 * four-field rows: the item followed by three values all equal to the item's count, with
 * "a" first and "b" second.
 */
@Test
public void exact() throws Exception {
  final String[] expectedItems = {"a", "b"};
  final long[] expectedCounts = {2L, 1L};

  final ItemsSketch<String> source = new ItemsSketch<String>(8);
  source.update("a");
  source.update("a");
  source.update("b");

  final EvalFunc<DataBag> func = new FrequentStringsSketchToEstimates();
  final Tuple inputTuple =
      PigUtil.objectsToTuple(new DataByteArray(source.toByteArray(new ArrayOfStringsSerDe())));
  final DataBag rows = func.exec(inputTuple);
  Assert.assertNotNull(rows);
  Assert.assertEquals(rows.size(), 2);

  final Iterator<Tuple> rowIterator = rows.iterator();
  for (int i = 0; i < expectedItems.length; i++) {
    final Tuple row = rowIterator.next();
    Assert.assertEquals(row.size(), 4);
    Assert.assertEquals((String) row.get(0), expectedItems[i]);
    // fields 1 through 3 must all equal the item's count
    for (int field = 1; field <= 3; field++) {
      Assert.assertEquals((long) row.get(field), expectedCounts[i]);
    }
  }
}
use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.
the class UnionFrequentStringsSketchTest method accumulator.
/**
 * Accumulates two batches, each holding one serialized sketch updated with "a" and "b",
 * then checks that the resulting sketch is non-empty, has two active items, and estimates
 * 2 for both "a" and "b".
 */
@Test
public void accumulator() throws Exception {
  final Accumulator<Tuple> union = new UnionFrequentStringsSketch("8");

  // both batches are built the same way: a fresh bag with one serialized sketch
  for (int batch = 0; batch < 2; batch++) {
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    final ItemsSketch<String> input = new ItemsSketch<String>(8);
    input.update("a");
    input.update("b");
    inputBag.add(
        PigUtil.objectsToTuple(new DataByteArray(input.toByteArray(new ArrayOfStringsSerDe()))));
    union.accumulate(PigUtil.objectsToTuple(inputBag));
  }

  final Tuple resultTuple = union.getValue();
  Assert.assertNotNull(resultTuple);
  Assert.assertEquals(resultTuple.size(), 1);

  final DataByteArray serialized = (DataByteArray) resultTuple.get(0);
  Assert.assertTrue(serialized.size() > 0);

  final ItemsSketch<String> merged =
      ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
  Assert.assertFalse(merged.isEmpty());
  Assert.assertEquals(merged.getNumActiveItems(), 2);
  Assert.assertEquals(merged.getEstimate("a"), 2);
  Assert.assertEquals(merged.getEstimate("b"), 2);
}
use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.
the class UnionFrequentStringsSketchTest method accumulatorEmptySketch.
/**
 * Accumulates a single batch holding a serialized sketch that was never updated, then
 * checks that the result is a non-zero-length serialized sketch that is empty and has no
 * active items.
 */
@Test
public void accumulatorEmptySketch() throws Exception {
  final Accumulator<Tuple> union = new UnionFrequentStringsSketch("8");

  final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
  final ItemsSketch<String> emptyInput = new ItemsSketch<String>(8);
  inputBag.add(
      PigUtil.objectsToTuple(new DataByteArray(emptyInput.toByteArray(new ArrayOfStringsSerDe()))));
  union.accumulate(PigUtil.objectsToTuple(inputBag));

  final Tuple resultTuple = union.getValue();
  Assert.assertNotNull(resultTuple);
  Assert.assertEquals(resultTuple.size(), 1);

  final DataByteArray serialized = (DataByteArray) resultTuple.get(0);
  Assert.assertTrue(serialized.size() > 0);

  final ItemsSketch<String> merged =
      ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
  Assert.assertTrue(merged.isEmpty());
  Assert.assertEquals(merged.getNumActiveItems(), 0);
}
Aggregations