Example use of com.yahoo.sketches.ArrayOfStringsSerDe in the sketches-pig project by DataSketches.
Class GetQuantileFromStringsSketch, method exec.
/**
 * Deserializes a quantiles sketch of strings and returns the quantile for a given fraction.
 *
 * @param input a tuple of exactly two fields: the serialized sketch as a
 *     {@code DataByteArray} and the fraction as a {@code Double}
 * @return the result of {@code ItemsSketch.getQuantile} for the given fraction
 * @throws IllegalArgumentException if the tuple does not have exactly two fields, or if
 *     either field is null or of an unexpected type
 * @throws IOException as declared by the overridden method
 */
@Override
public String exec(final Tuple input) throws IOException {
  if (input.size() != 2) {
    throw new IllegalArgumentException("expected two inputs: sketch and fraction");
  }

  final Object sketchArg = input.get(0);
  if (!(sketchArg instanceof DataByteArray)) {
    // instanceof is false for null, so describe null explicitly rather than dereferencing it
    throw new IllegalArgumentException("expected a DataByteArray as a sketch, got "
        + (sketchArg == null ? "null" : sketchArg.getClass().getSimpleName()));
  }

  // Validate the fraction before deserializing so that bad arguments fail fast.
  final Object fractionArg = input.get(1);
  if (!(fractionArg instanceof Double)) {
    throw new IllegalArgumentException("expected a double value as a fraction, got "
        + (fractionArg == null ? "null" : fractionArg.getClass().getSimpleName()));
  }

  final DataByteArray dba = (DataByteArray) sketchArg;
  final ItemsSketch<String> sketch = ItemsSketch.getInstance(
      Memory.wrap(dba.get()), Comparator.naturalOrder(), new ArrayOfStringsSerDe());
  final double fraction = (Double) fractionArg;
  return sketch.getQuantile(fraction);
}
Example use of com.yahoo.sketches.ArrayOfStringsSerDe in the sketches-pig project by DataSketches.
Class FrequentStringsSketchToEstimates, method exec.
/**
 * Deserializes a frequent-items sketch of strings and returns its frequent items as a bag.
 *
 * @param input a tuple whose first field is the serialized sketch as a {@code DataByteArray}
 * @return a bag holding one 4-field tuple (item, estimate, lower bound, upper bound) for each
 *     row returned by {@code getFrequentItems(errorType)}; {@code null} if the tuple is null
 *     or empty, or if its first field is null
 * @throws IOException as declared by the overridden method
 */
@Override
public DataBag exec(final Tuple input) throws IOException {
  if ((input == null) || (input.size() == 0)) {
    return null;
  }
  final DataByteArray dba = (DataByteArray) input.get(0);
  if (dba == null) {
    // A null sketch field is treated like missing input instead of failing on dba.get().
    return null;
  }
  final ItemsSketch<String> sketch =
      ItemsSketch.getInstance(Memory.wrap(dba.get()), new ArrayOfStringsSerDe());
  // errorType is defined outside this method.
  final ItemsSketch.Row<String>[] result = sketch.getFrequentItems(errorType);
  final DataBag bag = BagFactory.getInstance().newDefaultBag();
  for (final ItemsSketch.Row<String> row : result) {
    final Tuple tuple = TupleFactory.getInstance().newTuple(4);
    tuple.set(0, row.getItem());
    tuple.set(1, row.getEstimate());
    tuple.set(2, row.getLowerBound());
    tuple.set(3, row.getUpperBound());
    bag.add(tuple);
  }
  return bag;
}
Example use of com.yahoo.sketches.ArrayOfStringsSerDe in the sketches-pig project by DataSketches.
Class DataToFrequentStringsSketchTest, method exec.
// Runs exec() on one bag of items and checks the sketch deserialized from the result.
@Test
public void exec() throws Exception {
  final EvalFunc<Tuple> udf = new DataToFrequentStringsSketch("8");

  // Single-field tuples mixed with (item, count) tuples.
  final DataBag items = BagFactory.getInstance().newDefaultBag();
  items.add(PigUtil.objectsToTuple("a"));
  items.add(PigUtil.objectsToTuple("b", 5L));
  items.add(PigUtil.objectsToTuple("a", 2L));
  items.add(PigUtil.objectsToTuple("b"));

  final Tuple output = udf.exec(PigUtil.objectsToTuple(items));
  Assert.assertNotNull(output);
  Assert.assertEquals(output.size(), 1);

  final DataByteArray serialized = (DataByteArray) output.get(0);
  Assert.assertTrue(serialized.size() > 0);

  // Rebuild the sketch from the returned bytes and check the per-item estimates.
  final ItemsSketch<String> restored =
      ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
  Assert.assertEquals(restored.getNumActiveItems(), 2);
  Assert.assertEquals(restored.getEstimate("a"), 3);
  Assert.assertEquals(restored.getEstimate("b"), 6);
}
Example use of com.yahoo.sketches.ArrayOfStringsSerDe in the sketches-pig project by DataSketches.
Class DataToFrequentStringsSketchTest, method accumulator.
// Accumulates two batches, checks the resulting sketch, then checks that cleanup() resets it.
@Test
public void accumulator() throws Exception {
  final Accumulator<Tuple> udf = new DataToFrequentStringsSketch("8");

  // First batch: a single item.
  final DataBag firstBatch = BagFactory.getInstance().newDefaultBag();
  firstBatch.add(PigUtil.objectsToTuple("a"));
  final Tuple firstInput = TupleFactory.getInstance().newTuple(1);
  firstInput.set(0, firstBatch);
  udf.accumulate(firstInput);

  // Second batch: one single-field tuple and two (item, count) tuples.
  final DataBag secondBatch = BagFactory.getInstance().newDefaultBag();
  secondBatch.add(PigUtil.objectsToTuple("b"));
  secondBatch.add(PigUtil.objectsToTuple("a", 2L));
  secondBatch.add(PigUtil.objectsToTuple("b", 5L));
  final Tuple secondInput = TupleFactory.getInstance().newTuple(1);
  secondInput.set(0, secondBatch);
  udf.accumulate(secondInput);

  final Tuple accumulated = udf.getValue();
  Assert.assertNotNull(accumulated);
  Assert.assertEquals(accumulated.size(), 1);
  final DataByteArray accumulatedBytes = (DataByteArray) accumulated.get(0);
  Assert.assertTrue(accumulatedBytes.size() > 0);
  final ItemsSketch<String> restored =
      ItemsSketch.getInstance(Memory.wrap(accumulatedBytes.get()), new ArrayOfStringsSerDe());
  Assert.assertEquals(restored.getNumActiveItems(), 2);
  Assert.assertEquals(restored.getEstimate("a"), 3);
  Assert.assertEquals(restored.getEstimate("b"), 6);

  // After cleanup, getValue() should still return a serialized sketch, but an empty one.
  udf.cleanup();
  final Tuple afterCleanup = udf.getValue();
  Assert.assertNotNull(afterCleanup);
  Assert.assertEquals(afterCleanup.size(), 1);
  final DataByteArray emptyBytes = (DataByteArray) afterCleanup.get(0);
  Assert.assertTrue(emptyBytes.size() > 0);
  final ItemsSketch<String> emptySketch =
      ItemsSketch.getInstance(Memory.wrap(emptyBytes.get()), new ArrayOfStringsSerDe());
  Assert.assertTrue(emptySketch.isEmpty());
  Assert.assertEquals(emptySketch.getNumActiveItems(), 0);
}
Example use of com.yahoo.sketches.ArrayOfStringsSerDe in the sketches-pig project by DataSketches.
Class FrequentStringsSketchToEstimatesTest, method estimation.
// Feeds ten items with decreasing weights into a sketch and compares the bags produced
// under the two error types.
@Test
public void estimation() throws Exception {
  final ItemsSketch<String> source = new ItemsSketch<String>(8);

  // Items "1".."9" with explicit weights, updated in this order.
  final long[] weights = {1000, 500, 200, 100, 50, 20, 10, 5, 2};
  for (int i = 0; i < weights.length; i++) {
    source.update(Integer.toString(i + 1), weights[i]);
  }
  // Item "10" uses the single-argument update.
  source.update("10");

  final byte[] serialized = source.toByteArray(new ArrayOfStringsSerDe());
  final Tuple inputTuple = PigUtil.objectsToTuple(new DataByteArray(serialized));

  final EvalFunc<DataBag> noFalsePositives =
      new FrequentStringsSketchToEstimates("NO_FALSE_POSITIVES");
  final DataBag strictBag = noFalsePositives.exec(inputTuple);
  Assert.assertNotNull(strictBag);
  Assert.assertTrue(strictBag.size() < 10);

  final EvalFunc<DataBag> noFalseNegatives =
      new FrequentStringsSketchToEstimates("NO_FALSE_NEGATIVES");
  final DataBag inclusiveBag = noFalseNegatives.exec(inputTuple);
  Assert.assertNotNull(inclusiveBag);
  Assert.assertTrue(inclusiveBag.size() < 10);

  // The NO_FALSE_POSITIVES bag is expected to be strictly smaller than the other one.
  Assert.assertTrue(strictBag.size() < inclusiveBag.size());
}
Aggregations