Search in sources :

Example 11 with ArrayOfStringsSerDe

use of com.yahoo.sketches.ArrayOfStringsSerDe in project sketches-pig by DataSketches.

the class GetQuantileFromStringsSketch method exec.

/**
 * Returns the quantile of a serialized sketch of strings for a given fraction.
 *
 * @param input tuple with exactly two fields: field 0 must be a {@link DataByteArray}
 *     holding the serialized sketch, field 1 must be a {@link Double} fraction
 * @return the result of {@code getQuantile(fraction)} on the deserialized sketch
 * @throws IllegalArgumentException if the tuple does not have exactly two fields, or if a
 *     field is null or not of the expected type
 * @throws IOException if thrown while reading the tuple fields
 */
@Override
public String exec(final Tuple input) throws IOException {
    if (input.size() != 2) {
        throw new IllegalArgumentException("expected two inputs: sketch and fraction");
    }
    final Object sketchField = input.get(0);
    if (!(sketchField instanceof DataByteArray)) {
        // A null field also fails instanceof; describe it without dereferencing it so the
        // caller gets the intended IllegalArgumentException rather than a NullPointerException.
        throw new IllegalArgumentException("expected a DataByteArray as a sketch, got "
            + (sketchField == null ? "null" : sketchField.getClass().getSimpleName()));
    }
    final DataByteArray dba = (DataByteArray) sketchField;
    final ItemsSketch<String> sketch = ItemsSketch.getInstance(Memory.wrap(dba.get()), Comparator.naturalOrder(), new ArrayOfStringsSerDe());
    final Object fractionField = input.get(1);
    if (!(fractionField instanceof Double)) {
        // Same null-safe description as for the sketch field.
        throw new IllegalArgumentException("expected a double value as a fraction, got "
            + (fractionField == null ? "null" : fractionField.getClass().getSimpleName()));
    }
    final double fraction = (Double) fractionField;
    return sketch.getQuantile(fraction);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataByteArray(org.apache.pig.data.DataByteArray)

Example 12 with ArrayOfStringsSerDe

use of com.yahoo.sketches.ArrayOfStringsSerDe in project sketches-pig by DataSketches.

the class FrequentStringsSketchToEstimates method exec.

/**
 * Converts a serialized sketch of strings into a bag of frequent-item estimates.
 *
 * <p>Every row returned by {@code getFrequentItems(errorType)} becomes one 4-field tuple:
 * item, estimate, lower bound, upper bound.
 *
 * @param input tuple whose field 0 is cast to {@link DataByteArray} and deserialized
 * @return the bag of result tuples, or {@code null} when {@code input} is null or has no fields
 * @throws IOException if thrown while reading or populating tuples
 */
@Override
public DataBag exec(final Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    final DataByteArray serialized = (DataByteArray) input.get(0);
    final ItemsSketch<String> sketch =
        ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
    final ItemsSketch.Row<String>[] rows = sketch.getFrequentItems(errorType);
    final DataBag estimates = BagFactory.getInstance().newDefaultBag();
    for (final ItemsSketch.Row<String> row : rows) {
        // Field layout: item, estimate, lower bound, upper bound.
        final Tuple entry = TupleFactory.getInstance().newTuple(4);
        entry.set(0, row.getItem());
        entry.set(1, row.getEstimate());
        entry.set(2, row.getLowerBound());
        entry.set(3, row.getUpperBound());
        estimates.add(entry);
    }
    return estimates;
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 13 with ArrayOfStringsSerDe

use of com.yahoo.sketches.ArrayOfStringsSerDe in project sketches-pig by DataSketches.

the class DataToFrequentStringsSketchTest method exec.

/**
 * Feeds a bag mixing unweighted and weighted items through {@code exec} and checks the
 * per-item estimates of the resulting serialized sketch.
 */
@Test
public void exec() throws Exception {
    final EvalFunc<Tuple> udf = new DataToFrequentStringsSketch("8");

    // "a" appears once unweighted and once with weight 2; "b" once with weight 5 and once unweighted.
    final DataBag items = BagFactory.getInstance().newDefaultBag();
    items.add(PigUtil.objectsToTuple("a"));
    items.add(PigUtil.objectsToTuple("b", 5L));
    items.add(PigUtil.objectsToTuple("a", 2L));
    items.add(PigUtil.objectsToTuple("b"));

    final Tuple output = udf.exec(PigUtil.objectsToTuple(items));
    Assert.assertNotNull(output);
    Assert.assertEquals(output.size(), 1);

    final DataByteArray serialized = (DataByteArray) output.get(0);
    Assert.assertTrue(serialized.size() > 0);

    final ItemsSketch<String> restored =
        ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
    Assert.assertEquals(restored.getNumActiveItems(), 2);
    Assert.assertEquals(restored.getEstimate("a"), 3);
    Assert.assertEquals(restored.getEstimate("b"), 6);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 14 with ArrayOfStringsSerDe

use of com.yahoo.sketches.ArrayOfStringsSerDe in project sketches-pig by DataSketches.

the class DataToFrequentStringsSketchTest method accumulator.

/**
 * Accumulates two batches of items, checks the estimates of the resulting sketch, then
 * verifies that the value obtained after {@code cleanup()} is an empty sketch.
 */
@Test
public void accumulator() throws Exception {
    final Accumulator<Tuple> acc = new DataToFrequentStringsSketch("8");

    // First batch: a single unweighted "a".
    final Tuple firstBatch = TupleFactory.getInstance().newTuple(1);
    final DataBag firstBag = BagFactory.getInstance().newDefaultBag();
    firstBag.add(PigUtil.objectsToTuple("a"));
    firstBatch.set(0, firstBag);
    acc.accumulate(firstBatch);

    // Second batch: unweighted "b" plus weighted "a" and "b".
    final Tuple secondBatch = TupleFactory.getInstance().newTuple(1);
    final DataBag secondBag = BagFactory.getInstance().newDefaultBag();
    secondBag.add(PigUtil.objectsToTuple("b"));
    secondBag.add(PigUtil.objectsToTuple("a", 2L));
    secondBag.add(PigUtil.objectsToTuple("b", 5L));
    secondBatch.set(0, secondBag);
    acc.accumulate(secondBatch);

    final Tuple accumulated = acc.getValue();
    Assert.assertNotNull(accumulated);
    Assert.assertEquals(accumulated.size(), 1);
    final DataByteArray accumulatedBytes = (DataByteArray) accumulated.get(0);
    Assert.assertTrue(accumulatedBytes.size() > 0);
    final ItemsSketch<String> populated =
        ItemsSketch.getInstance(Memory.wrap(accumulatedBytes.get()), new ArrayOfStringsSerDe());
    Assert.assertEquals(populated.getNumActiveItems(), 2);
    Assert.assertEquals(populated.getEstimate("a"), 3);
    Assert.assertEquals(populated.getEstimate("b"), 6);

    // After cleanup the accumulator must still return a serialized sketch, but an empty one.
    acc.cleanup();
    final Tuple afterCleanup = acc.getValue();
    Assert.assertNotNull(afterCleanup);
    Assert.assertEquals(afterCleanup.size(), 1);
    final DataByteArray clearedBytes = (DataByteArray) afterCleanup.get(0);
    Assert.assertTrue(clearedBytes.size() > 0);
    final ItemsSketch<String> cleared =
        ItemsSketch.getInstance(Memory.wrap(clearedBytes.get()), new ArrayOfStringsSerDe());
    Assert.assertTrue(cleared.isEmpty());
    Assert.assertEquals(cleared.getNumActiveItems(), 0);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 15 with ArrayOfStringsSerDe

use of com.yahoo.sketches.ArrayOfStringsSerDe in project sketches-pig by DataSketches.

the class FrequentStringsSketchToEstimatesTest method estimation.

/**
 * Serializes a sketch holding ten items with decreasing weights and checks that both error
 * types return fewer than ten rows, with NO_FALSE_POSITIVES returning strictly fewer rows
 * than NO_FALSE_NEGATIVES.
 */
@Test
public void estimation() throws Exception {
    final ItemsSketch<String> source = new ItemsSketch<>(8);
    source.update("1", 1000);
    source.update("2", 500);
    source.update("3", 200);
    source.update("4", 100);
    source.update("5", 50);
    source.update("6", 20);
    source.update("7", 10);
    source.update("8", 5);
    source.update("9", 2);
    source.update("10");

    final DataByteArray serialized = new DataByteArray(source.toByteArray(new ArrayOfStringsSerDe()));
    final Tuple sketchTuple = PigUtil.objectsToTuple(serialized);

    final EvalFunc<DataBag> noFalsePositives = new FrequentStringsSketchToEstimates("NO_FALSE_POSITIVES");
    final DataBag strictRows = noFalsePositives.exec(sketchTuple);
    Assert.assertNotNull(strictRows);
    Assert.assertTrue(strictRows.size() < 10);

    final EvalFunc<DataBag> noFalseNegatives = new FrequentStringsSketchToEstimates("NO_FALSE_NEGATIVES");
    final DataBag inclusiveRows = noFalseNegatives.exec(sketchTuple);
    Assert.assertNotNull(inclusiveRows);
    Assert.assertTrue(inclusiveRows.size() < 10);

    // The two error types must produce result sets of different sizes, in this direction.
    Assert.assertTrue(strictRows.size() < inclusiveRows.size());
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe): 43, Test (org.testng.annotations.Test): 37, DataByteArray (org.apache.pig.data.DataByteArray): 23, Tuple (org.apache.pig.data.Tuple): 19, WritableMemory (com.yahoo.memory.WritableMemory): 15, DataBag (org.apache.pig.data.DataBag): 12, Memory (com.yahoo.memory.Memory): 10, ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch): 9, SketchesArgumentException (com.yahoo.sketches.SketchesArgumentException): 1