Search in sources :

Example 1 with ItemsSketch

use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.

the class DataToFrequentItemsSketchAlgebraicIntermediateFinal method exec.

@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // this is to see in the log which way was used by Pig
        Logger.getLogger(getClass()).info("algebraic was used");
        isFirstCall_ = false;
    }
    final ItemsSketch<T> sketch = new ItemsSketch<T>(sketchSize_);
    final DataBag bag = (DataBag) inputTuple.get(0);
    for (Tuple dataTuple : bag) {
        final Object item = dataTuple.get(0);
        if (item instanceof DataBag) {
            // this is a bag from the Initial function.
            // just insert each item of the tuple into the sketch
            DataToFrequentItemsSketch.updateSketch((DataBag) item, sketch);
        } else if (item instanceof DataByteArray) {
            // This is a sketch from a prior call to the
            // Intermediate function. merge it with the
            // current sketch.
            final ItemsSketch<T> incomingSketch = Util.deserializeSketchFromTuple(dataTuple, serDe_);
            sketch.merge(incomingSketch);
        } else {
            // we should never get here.
            throw new IllegalArgumentException("InputTuple.Field0: Bag contains unrecognized types: " + item.getClass().getName());
        }
    }
    return Util.serializeSketchToTuple(sketch, serDe_);
}
Also used : DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 2 with ItemsSketch

use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.

the class FrequentStringsSketchToEstimatesTest method estimation.

@Test
public void estimation() throws Exception {
    ItemsSketch<String> sketch = new ItemsSketch<String>(8);
    sketch.update("1", 1000);
    sketch.update("2", 500);
    sketch.update("3", 200);
    sketch.update("4", 100);
    sketch.update("5", 50);
    sketch.update("6", 20);
    sketch.update("7", 10);
    sketch.update("8", 5);
    sketch.update("9", 2);
    sketch.update("10");
    Tuple inputTuple = PigUtil.objectsToTuple(new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe())));
    EvalFunc<DataBag> func1 = new FrequentStringsSketchToEstimates("NO_FALSE_POSITIVES");
    DataBag bag1 = func1.exec(inputTuple);
    Assert.assertNotNull(bag1);
    Assert.assertTrue(bag1.size() < 10);
    EvalFunc<DataBag> func2 = new FrequentStringsSketchToEstimates("NO_FALSE_NEGATIVES");
    DataBag bag2 = func2.exec(inputTuple);
    Assert.assertNotNull(bag2);
    Assert.assertTrue(bag2.size() < 10);
    Assert.assertTrue(bag1.size() < bag2.size());
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 3 with ItemsSketch

use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.

the class FrequentStringsSketchToEstimatesTest method exact.

@Test
public void exact() throws Exception {
    EvalFunc<DataBag> func = new FrequentStringsSketchToEstimates();
    ItemsSketch<String> sketch = new ItemsSketch<String>(8);
    sketch.update("a");
    sketch.update("a");
    sketch.update("b");
    Tuple inputTuple = PigUtil.objectsToTuple(new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe())));
    DataBag bag = func.exec(inputTuple);
    Assert.assertNotNull(bag);
    Assert.assertEquals(bag.size(), 2);
    Iterator<Tuple> it = bag.iterator();
    Tuple tuple1 = it.next();
    Assert.assertEquals(tuple1.size(), 4);
    Assert.assertEquals((String) tuple1.get(0), "a");
    Assert.assertEquals((long) tuple1.get(1), 2L);
    Assert.assertEquals((long) tuple1.get(2), 2L);
    Assert.assertEquals((long) tuple1.get(3), 2L);
    Tuple tuple2 = it.next();
    Assert.assertEquals(tuple2.size(), 4);
    Assert.assertEquals((String) tuple2.get(0), "b");
    Assert.assertEquals((long) tuple2.get(1), 1L);
    Assert.assertEquals((long) tuple2.get(2), 1L);
    Assert.assertEquals((long) tuple2.get(3), 1L);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 4 with ItemsSketch

use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.

the class UnionFrequentStringsSketchTest method accumulator.

@Test
public void accumulator() throws Exception {
    Accumulator<Tuple> func = new UnionFrequentStringsSketch("8");
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    {
        ItemsSketch<String> sketch = new ItemsSketch<String>(8);
        sketch.update("a");
        sketch.update("b");
        bag.add(PigUtil.objectsToTuple(new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe()))));
    }
    func.accumulate(PigUtil.objectsToTuple(bag));
    bag = BagFactory.getInstance().newDefaultBag();
    {
        ItemsSketch<String> sketch = new ItemsSketch<String>(8);
        sketch.update("a");
        sketch.update("b");
        bag.add(PigUtil.objectsToTuple(new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe()))));
    }
    func.accumulate(PigUtil.objectsToTuple(bag));
    Tuple resultTuple = func.getValue();
    Assert.assertNotNull(resultTuple);
    Assert.assertEquals(resultTuple.size(), 1);
    DataByteArray bytes = (DataByteArray) resultTuple.get(0);
    Assert.assertTrue(bytes.size() > 0);
    ItemsSketch<String> sketch = ItemsSketch.getInstance(Memory.wrap(bytes.get()), new ArrayOfStringsSerDe());
    Assert.assertFalse(sketch.isEmpty());
    Assert.assertEquals(sketch.getNumActiveItems(), 2);
    Assert.assertEquals(sketch.getEstimate("a"), 2);
    Assert.assertEquals(sketch.getEstimate("b"), 2);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 5 with ItemsSketch

use of com.yahoo.sketches.frequencies.ItemsSketch in project sketches-pig by DataSketches.

the class UnionFrequentStringsSketchTest method accumulatorEmptySketch.

@Test
public void accumulatorEmptySketch() throws Exception {
    Accumulator<Tuple> func = new UnionFrequentStringsSketch("8");
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    {
        ItemsSketch<String> sketch = new ItemsSketch<String>(8);
        bag.add(PigUtil.objectsToTuple(new DataByteArray(sketch.toByteArray(new ArrayOfStringsSerDe()))));
    }
    func.accumulate(PigUtil.objectsToTuple(bag));
    Tuple resultTuple = func.getValue();
    Assert.assertNotNull(resultTuple);
    Assert.assertEquals(resultTuple.size(), 1);
    DataByteArray bytes = (DataByteArray) resultTuple.get(0);
    Assert.assertTrue(bytes.size() > 0);
    ItemsSketch<String> sketch = ItemsSketch.getInstance(Memory.wrap(bytes.get()), new ArrayOfStringsSerDe());
    Assert.assertTrue(sketch.isEmpty());
    Assert.assertEquals(sketch.getNumActiveItems(), 0);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch)11 DataBag (org.apache.pig.data.DataBag)11 DataByteArray (org.apache.pig.data.DataByteArray)11 Tuple (org.apache.pig.data.Tuple)11 ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe)9 Test (org.testng.annotations.Test)9