Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.
Class FrequentStringsSketchToEstimatesTest, method exact.
/**
 * Feeds a serialized sketch holding "a" (seen twice) and "b" (seen once) to
 * {@link FrequentStringsSketchToEstimates} and checks that the returned bag has one
 * four-field tuple per item: the item itself followed by three long values that all
 * equal the item's update count.
 */
@Test
public void exact() throws Exception {
  // Build the input sketch: "a" twice, "b" once.
  final ItemsSketch<String> source = new ItemsSketch<String>(8);
  source.update("a");
  source.update("a");
  source.update("b");
  final DataByteArray serialized = new DataByteArray(source.toByteArray(new ArrayOfStringsSerDe()));

  final EvalFunc<DataBag> toEstimates = new FrequentStringsSketchToEstimates();
  final DataBag estimates = toEstimates.exec(PigUtil.objectsToTuple(serialized));
  Assert.assertNotNull(estimates);
  Assert.assertEquals(estimates.size(), 2);

  // Expected rows, in the order the bag is expected to yield them.
  final String[] expectedItems = {"a", "b"};
  final long[] expectedCounts = {2L, 1L};

  final Iterator<Tuple> rows = estimates.iterator();
  for (int i = 0; i < expectedItems.length; i++) {
    final Tuple row = rows.next();
    Assert.assertEquals(row.size(), 4);
    Assert.assertEquals((String) row.get(0), expectedItems[i]);
    // Fields 1 through 3 must each match the number of updates for the item.
    for (int field = 1; field <= 3; field++) {
      Assert.assertEquals((long) row.get(field), expectedCounts[i]);
    }
  }
}
Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.
Class UnionFrequentStringsSketchTest, method accumulator.
/**
 * Accumulates two bags, each holding one serialized sketch with a single "a" and a
 * single "b", and checks that the resulting sketch reports two active items with an
 * estimate of 2 for each.
 */
@Test
public void accumulator() throws Exception {
  final Accumulator<Tuple> union = new UnionFrequentStringsSketch("8");

  // Two identical rounds: one fresh bag with one fresh sketch per accumulate() call.
  for (int round = 0; round < 2; round++) {
    final DataBag batch = BagFactory.getInstance().newDefaultBag();
    final ItemsSketch<String> part = new ItemsSketch<String>(8);
    part.update("a");
    part.update("b");
    batch.add(PigUtil.objectsToTuple(new DataByteArray(part.toByteArray(new ArrayOfStringsSerDe()))));
    union.accumulate(PigUtil.objectsToTuple(batch));
  }

  final Tuple resultTuple = union.getValue();
  Assert.assertNotNull(resultTuple);
  Assert.assertEquals(resultTuple.size(), 1);

  final DataByteArray serialized = (DataByteArray) resultTuple.get(0);
  Assert.assertTrue(serialized.size() > 0);

  // Deserialize the result and inspect the merged counts.
  final ItemsSketch<String> merged =
      ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
  Assert.assertFalse(merged.isEmpty());
  Assert.assertEquals(merged.getNumActiveItems(), 2);
  Assert.assertEquals(merged.getEstimate("a"), 2);
  Assert.assertEquals(merged.getEstimate("b"), 2);
}
Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.
Class UnionFrequentStringsSketchTest, method accumulatorEmptySketch.
/**
 * Accumulates a single bag whose only sketch never received an update and checks that
 * the result is a non-zero-length serialization of an empty sketch.
 */
@Test
public void accumulatorEmptySketch() throws Exception {
  final Accumulator<Tuple> union = new UnionFrequentStringsSketch("8");

  // One bag containing one serialized sketch with no updates.
  final DataBag batch = BagFactory.getInstance().newDefaultBag();
  final ItemsSketch<String> untouched = new ItemsSketch<String>(8);
  final byte[] untouchedBytes = untouched.toByteArray(new ArrayOfStringsSerDe());
  batch.add(PigUtil.objectsToTuple(new DataByteArray(untouchedBytes)));
  union.accumulate(PigUtil.objectsToTuple(batch));

  final Tuple resultTuple = union.getValue();
  Assert.assertNotNull(resultTuple);
  Assert.assertEquals(resultTuple.size(), 1);

  final DataByteArray serialized = (DataByteArray) resultTuple.get(0);
  Assert.assertTrue(serialized.size() > 0);

  final ItemsSketch<String> merged =
      ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
  Assert.assertTrue(merged.isEmpty());
  Assert.assertEquals(merged.getNumActiveItems(), 0);
}
Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.
Class Union, method accumulate.
// ACCUMULATOR INTERFACE
/**
 * The <i>Accumulator</i> counterpart of <i>exec()</i>. Each call receives a bag of
 * Sketch Tuples and merges them into a union that is kept between calls, so the method
 * may be invoked repeatedly with successive bags. In contrast to <i>exec()</i>, nothing
 * is serialized here.
 *
 * <p>The retained union is created on the first call. A call whose input yields no bag
 * leaves the union's contents unchanged.
 *
 * @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
 * @see #exec
 * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)"
 * @throws IOException by Pig
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
  // throws is in API
  // Lazily create the retained union from this instance's configuration.
  if (accumUnion_ == null) {
    accumUnion_ = SetOperation.builder()
        .setP(p_)
        .setSeed(seed_)
        .setResizeFactor(RF)
        .setNominalEntries(nomEntries_)
        .buildUnion();
  }
  final DataBag sketchTuples = extractBag(inputTuple);
  if (sketchTuples != null) {
    updateUnion(sketchTuples, accumUnion_);
  }
}
Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.
Class Union, method exec.
// @formatter:off
/**
 * Top-level exec function.
 * This method accepts an input Tuple containing a Bag of one or more inner <b>Sketch Tuples</b>
 * and returns a single updated <b>Sketch</b> as a <b>Sketch Tuple</b>.
 *
 * <p>If a large number of calls are anticipated, leveraging either the <i>Algebraic</i> or
 * <i>Accumulator</i> interfaces is recommended. Pig normally handles this automatically.
 *
 * <p>Internally, this method presents the inner <b>Sketch Tuples</b> to a new <b>Union</b>.
 * The result is returned as a <b>Sketch Tuple</b>. The Union is only built once a bag has
 * actually been extracted from the input; when there is no bag, the pre-built empty
 * sketch tuple is returned directly.
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Sketch Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Sketch Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
 * @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch (8 bytes).
 * @throws IOException by Pig
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 */
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
  // throws is in API
  // The exec is a stateless function. It operates on the input and returns a result.
  // It can only call static functions.
  final DataBag bag = extractBag(inputTuple);
  if (bag == null) {
    // Nothing to merge: return the shared empty result (configured with parent)
    // before allocating a Union that would never be used.
    return emptyCompactOrderedSketchTuple_;
  }
  // A fresh Union per call keeps exec independent of any accumulator state.
  final com.yahoo.sketches.theta.Union union = SetOperation.builder()
      .setP(p_)
      .setSeed(seed_)
      .setResizeFactor(RF)
      .setNominalEntries(nomEntries_)
      .buildUnion();
  updateUnion(bag, union);
  final CompactSketch compactSketch = union.getResult(true, null);
  return compactOrderedSketchToTuple(compactSketch);
}
Aggregations