use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToSketch method exec.
// @formatter:off
/**
 ***********************************************************************************************
 * Top-level exec function.
 * This method accepts an input Tuple containing a Bag of one or more inner <b>Datum Tuples</b>
 * and returns a single updated <b>Sketch</b> as a <b>Sketch Tuple</b>.
 *
 * <p>If a large number of calls is anticipated, leveraging either the <i>Algebraic</i> or
 * <i>Accumulator</i> interfaces is recommended. Pig normally handles this automatically.
 *
 * <p>Internally, this method presents the inner <b>Datum Tuples</b> to a new <b>Sketch</b>,
 * which is returned as a <b>Sketch Tuple</b>. When no bag can be extracted from the input,
 * the pre-built empty sketch tuple is returned and no union is allocated.
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Datum Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Datum Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Datum Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: Java data type : Pig DataType: may be any one of:
 *         <ul>
 *           <li>Byte: BYTE</li>
 *           <li>Integer: INTEGER</li>
 *           <li>Long: LONG</li>
 *           <li>Float: FLOAT</li>
 *           <li>Double: DOUBLE</li>
 *           <li>String: CHARARRAY</li>
 *           <li>DataByteArray: BYTEARRAY</li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch (8 bytes).
 * @throws IOException from Pig.
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 */
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
  // throws is in API
  // The exec is a stateless function. It operates on the input and returns a result.
  // It can only call static functions.
  final DataBag bag = extractBag(inputTuple);
  if (bag == null) {
    // Nothing to aggregate: return the shared empty result (configured with parent)
    // before allocating a Union that would never be used.
    return emptyCompactOrderedSketchTuple_;
  }
  // A fresh union per call keeps exec free of state carried between invocations.
  final Union union = newUnion(nomEntries_, p_, seed_);
  // updates union with all elements of the bag
  updateUnion(bag, union);
  final CompactSketch compOrdSketch = union.getResult(true, null);
  return compactOrderedSketchToTuple(compOrdSketch);
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToFrequentStringsSketchTest method algebraicIntermediateFinalWrongType.
/**
 * Passes the IntermediateFinal stage a bag whose only tuple wraps a double and expects the
 * call to fail with an IllegalArgumentException.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void algebraicIntermediateFinalWrongType() throws Exception {
  final EvalFunc<Tuple> intermediateFinal = new DataToFrequentStringsSketch.IntermediateFinal("8");
  final DataBag wrongTypeBag = BagFactory.getInstance().newDefaultBag();
  final TupleFactory tuples = TupleFactory.getInstance();

  // tuples in this bag are required to hold either bags or data byte arrays; a double is neither
  wrongTypeBag.add(tuples.newTuple(1.0));

  intermediateFinal.exec(tuples.newTuple(wrongTypeBag));
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToFrequentStringsSketchTest method exec.
/**
 * Runs the UDF once over a bag mixing one-field tuples (item only) and two-field tuples
 * (item plus a long), then deserializes the returned bytes and checks the per-item estimates.
 */
@Test
public void exec() throws Exception {
  final EvalFunc<Tuple> udf = new DataToFrequentStringsSketch("8");

  final DataBag items = BagFactory.getInstance().newDefaultBag();
  items.add(PigUtil.objectsToTuple("a"));
  items.add(PigUtil.objectsToTuple("b", 5L));
  items.add(PigUtil.objectsToTuple("a", 2L));
  items.add(PigUtil.objectsToTuple("b"));

  final Tuple output = udf.exec(PigUtil.objectsToTuple(items));

  // the result is a one-field tuple carrying the non-empty serialized sketch
  Assert.assertNotNull(output);
  Assert.assertEquals(output.size(), 1);
  final DataByteArray serialized = (DataByteArray) output.get(0);
  Assert.assertTrue(serialized.size() > 0);

  final Memory image = Memory.wrap(serialized.get());
  final ItemsSketch<String> result = ItemsSketch.getInstance(image, new ArrayOfStringsSerDe());

  // two distinct items; the expected totals are "a" = 3 and "b" = 6
  Assert.assertEquals(result.getNumActiveItems(), 2);
  Assert.assertEquals(result.getEstimate("a"), 3);
  Assert.assertEquals(result.getEstimate("b"), 6);
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToFrequentStringsSketchTest method execWrongSizeOfInnerTuple.
/**
 * Supplies a bag whose single inner tuple is built from no objects at all and expects exec
 * to fail with an IllegalArgumentException.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void execWrongSizeOfInnerTuple() throws Exception {
  final EvalFunc<Tuple> udf = new DataToFrequentStringsSketch("8");

  final DataBag badBag = BagFactory.getInstance().newDefaultBag();
  // inner tuple created from zero objects
  badBag.add(PigUtil.objectsToTuple());

  udf.exec(PigUtil.objectsToTuple(badBag));
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToFrequentStringsSketchTest method accumulator.
/**
 * Drives the Accumulator interface with two successive batches, checks the combined sketch
 * returned by getValue(), then checks that getValue() after cleanup() yields an empty sketch.
 */
@Test
public void accumulator() throws Exception {
  final Accumulator<Tuple> acc = new DataToFrequentStringsSketch("8");

  // first batch: a single "a"
  final Tuple firstInput = TupleFactory.getInstance().newTuple(1);
  final DataBag firstBatch = BagFactory.getInstance().newDefaultBag();
  firstBatch.add(PigUtil.objectsToTuple("a"));
  firstInput.set(0, firstBatch);
  acc.accumulate(firstInput);

  // second batch: "b", then "a" and "b" each paired with a long
  final Tuple secondInput = TupleFactory.getInstance().newTuple(1);
  final DataBag secondBatch = BagFactory.getInstance().newDefaultBag();
  secondBatch.add(PigUtil.objectsToTuple("b"));
  secondBatch.add(PigUtil.objectsToTuple("a", 2L));
  secondBatch.add(PigUtil.objectsToTuple("b", 5L));
  secondInput.set(0, secondBatch);
  acc.accumulate(secondInput);

  // the accumulated value must reflect both batches
  final Tuple accumulated = acc.getValue();
  Assert.assertNotNull(accumulated);
  Assert.assertEquals(accumulated.size(), 1);
  final DataByteArray accumulatedBytes = (DataByteArray) accumulated.get(0);
  Assert.assertTrue(accumulatedBytes.size() > 0);
  final Memory accumulatedImage = Memory.wrap(accumulatedBytes.get());
  final ItemsSketch<String> combined =
      ItemsSketch.getInstance(accumulatedImage, new ArrayOfStringsSerDe());
  Assert.assertEquals(combined.getNumActiveItems(), 2);
  Assert.assertEquals(combined.getEstimate("a"), 3);
  Assert.assertEquals(combined.getEstimate("b"), 6);

  // after cleanup, the value should always be 0
  acc.cleanup();
  final Tuple afterCleanup = acc.getValue();
  Assert.assertNotNull(afterCleanup);
  Assert.assertEquals(afterCleanup.size(), 1);
  final DataByteArray clearedBytes = (DataByteArray) afterCleanup.get(0);
  Assert.assertTrue(clearedBytes.size() > 0);
  final Memory clearedImage = Memory.wrap(clearedBytes.get());
  final ItemsSketch<String> cleared =
      ItemsSketch.getInstance(clearedImage, new ArrayOfStringsSerDe());
  Assert.assertTrue(cleared.isEmpty());
  Assert.assertEquals(cleared.getNumActiveItems(), 0);
}
Aggregations