use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToItemsSketch method exec.
// @formatter:off
/**
* Top-level exec function.
* This method accepts an input Tuple containing a Bag of zero or more inner <b>Datum Tuples</b>
* and returns a single <b>Sketch</b> as a <b>Sketch Tuple</b>.
*
* <p>If a large number of calls is anticipated, leveraging either the <i>Algebraic</i> or
* <i>Accumulator</i> interfaces is recommended. Pig normally handles this automatically.
*
* <p>Internally, this method presents the inner <b>Datum Tuples</b> to a new <b>Union</b>,
* which is returned as a <b>Sketch Tuple</b>
*
* <p>Types below are in the form: Java data type: Pig DataType
*
* <p><b>Input Tuple</b>
* <ul>
* <li>Tuple: TUPLE (Must contain only one field)
* <ul>
* <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
* <ul>
* <li>index 0: Tuple: TUPLE <b>Datum Tuple</b></li>
* <li>...</li>
* <li>index n-1: Tuple: TUPLE <b>Datum Tuple</b></li>
* </ul>
* </li>
* </ul>
* </li>
* </ul>
*
* <b>Datum Tuple</b>
* <ul>
* <li>Tuple: TUPLE (Must contain only one field)
* <ul>
* <li>index 0: T: some suitable Pig type convertible to T</li>
* </ul>
* </li>
* </ul>
*
* <b>Sketch Tuple</b>
* <ul>
* <li>Tuple: TUPLE (Contains exactly 1 field)
* <ul>
* <li>index 0: DataByteArray: BYTEARRAY = a serialized ItemsSketch object.</li>
* </ul>
* </li>
* </ul>
*
* @param inputTuple A tuple containing a single bag, containing Datum Tuples.
* @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch.
* @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
* @throws IOException from Pig.
*/
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
  // Stateless call: the result is built only from inputTuple and the configured fields.
  final boolean hasInput = inputTuple != null && inputTuple.size() > 0;
  if (hasInput) {
    // Use the configured k when it is positive, otherwise the library default.
    final ItemsUnion<T> merger;
    if (k_ > 0) {
      merger = ItemsUnion.getInstance(k_, comparator_);
    } else {
      merger = ItemsUnion.getInstance(comparator_);
    }
    final DataBag datumBag = (DataBag) inputTuple.get(0);
    for (final Tuple datumTuple : datumBag) {
      final Object rawValue = datumTuple.get(0);
      merger.update(extractValue(rawValue));
    }
    final ItemsSketch<T> merged = merger.getResultAndReset();
    if (merged != null) {
      final byte[] mergedBytes = merged.toByteArray(serDe_);
      return tupleFactory_.newTuple(new DataByteArray(mergedBytes));
    }
  }
  // No usable input (or no result from the union): hand back an empty sketch.
  final ItemsSketch<T> emptySketch;
  if (k_ > 0) {
    emptySketch = ItemsSketch.getInstance(k_, comparator_);
  } else {
    emptySketch = ItemsSketch.getInstance(comparator_);
  }
  final byte[] emptyBytes = emptySketch.toByteArray(serDe_);
  return tupleFactory_.newTuple(new DataByteArray(emptyBytes));
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class FrequentStringsSketchToEstimates method exec.
/**
 * Converts a serialized frequent-strings sketch into a bag of estimate rows.
 *
 * <p>Field 0 of the input is read as a DataByteArray holding the sketch bytes. Each frequent
 * item reported by the sketch (for the configured error type) becomes one 4-field tuple:
 * item, estimate, lower bound, upper bound.
 *
 * @param input tuple whose field 0 is the serialized sketch
 * @return bag of 4-field tuples, or null if input is null or has no fields
 * @throws IOException from Pig.
 */
@Override
public DataBag exec(final Tuple input) throws IOException {
  final boolean nothingToDo = (input == null) || (input.size() == 0);
  if (nothingToDo) {
    return null;
  }
  final DataByteArray serialized = (DataByteArray) input.get(0);
  final byte[] sketchBytes = serialized.get();
  final ItemsSketch<String> sketch =
      ItemsSketch.getInstance(Memory.wrap(sketchBytes), new ArrayOfStringsSerDe());
  final ItemsSketch.Row<String>[] rows = sketch.getFrequentItems(errorType);
  final DataBag estimates = BagFactory.getInstance().newDefaultBag();
  for (final ItemsSketch.Row<String> row : rows) {
    final Tuple rowTuple = TupleFactory.getInstance().newTuple(4);
    rowTuple.set(0, row.getItem());
    rowTuple.set(1, row.getEstimate());
    rowTuple.set(2, row.getLowerBound());
    rowTuple.set(3, row.getUpperBound());
    estimates.add(rowTuple);
  }
  return estimates;
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToFrequentItemsSketch method accumulate.
/**
 * Accumulator entry point: feeds the bag in field 0 of the input into the running sketch,
 * creating that sketch on first use.
 *
 * @param inputTuple tuple that must contain exactly one field, the bag of items
 * @throws IOException from Pig.
 * @throws IllegalArgumentException if the tuple does not have exactly one field
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
  if (isFirstCall_) {
    // one-time log entry so the log shows Pig took the accumulate path
    Logger.getLogger(getClass()).info("accumulate was used");
    isFirstCall_ = false;
  }
  // The running sketch is created lazily, before the input is validated.
  if (accumSketch_ == null) {
    accumSketch_ = new ItemsSketch<T>(sketchSize_);
  }
  if (inputTuple.size() == 1) {
    updateSketch((DataBag) inputTuple.get(0), accumSketch_);
    return;
  }
  throw new IllegalArgumentException("Input tuple must have 1 bag");
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class DataToFrequentItemsSketchAlgebraicIntermediateFinal method exec.
/**
 * Intermediate/Final step of the algebraic evaluation: folds every tuple of the bag in
 * field 0 into a fresh sketch and returns that sketch serialized as a tuple.
 *
 * <p>Each tuple in the bag is dispatched on the runtime type of its field 0:
 * <ul>
 * <li>DataBag: raw items, inserted into the sketch one by one</li>
 * <li>DataByteArray: a serialized sketch, deserialized and merged into the sketch</li>
 * </ul>
 *
 * @param inputTuple tuple whose field 0 is the bag of partial inputs
 * @return tuple holding the serialized combined sketch
 * @throws IOException from Pig.
 * @throws IllegalArgumentException if a tuple's field 0 is neither a DataBag nor a
 *     DataByteArray (including null)
 */
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
  if (isFirstCall_) {
    // this is to see in the log which way was used by Pig
    Logger.getLogger(getClass()).info("algebraic was used");
    isFirstCall_ = false;
  }
  final ItemsSketch<T> sketch = new ItemsSketch<T>(sketchSize_);
  final DataBag bag = (DataBag) inputTuple.get(0);
  for (final Tuple dataTuple : bag) {
    final Object item = dataTuple.get(0);
    if (item instanceof DataBag) {
      // this is a bag from the Initial function.
      // just insert each item of the tuple into the sketch
      DataToFrequentItemsSketch.updateSketch((DataBag) item, sketch);
    } else if (item instanceof DataByteArray) {
      // This is a sketch from a prior call to the
      // Intermediate function. merge it with the
      // current sketch.
      final ItemsSketch<T> incomingSketch = Util.deserializeSketchFromTuple(dataTuple, serDe_);
      sketch.merge(incomingSketch);
    } else {
      // we should never get here.
      // A null item fails both instanceof checks, so the type name must be built
      // null-safely; otherwise a NullPointerException would hide the real problem.
      final String typeName = (item == null) ? "null" : item.getClass().getName();
      throw new IllegalArgumentException("InputTuple.Field0: Bag contains unrecognized types: " + typeName);
    }
  }
  return Util.serializeSketchToTuple(sketch, serDe_);
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirSamplingTest method execTest.
/**
 * Runs ReservoirSampling.exec() twice on a growing bag: first with fewer items than k,
 * then with more, checking the reported item count, k, and sample-bag size each time.
 */
@Test
public void execTest() throws IOException {
  // same checks as the accumulate() tests, since that handles both data paths
  final int k = 32;
  final long n = 24;
  final long doubled = 2 * n;
  final TupleFactory tupleFactory = TupleFactory.getInstance();
  final DataBag samples = BagFactory.getInstance().newDefaultBag();

  // first pass: n items, fewer than k
  for (long value = 0; value < n; value++) {
    final Tuple record = tupleFactory.newTuple(2);
    record.set(0, value);
    record.set(1, Long.toString(-value));
    samples.add(record);
  }
  final Tuple input = tupleFactory.newTuple(samples);
  final ReservoirSampling rs = new ReservoirSampling(Integer.toString(k));

  Tuple result = rs.exec(input);
  assertEquals(result.size(), 3, "Incorrect output size");
  assertEquals(result.get(0), n, "Incorrect number of samples seen");
  assertEquals(result.get(1), k, "Incorrect value of k");
  final DataBag firstSamples = (DataBag) result.get(2);
  assertEquals(firstSamples.size(), n);

  // second pass: grow the same bag to 2n items, more than k, and run again
  for (long value = n; value < doubled; value++) {
    final Tuple record = tupleFactory.newTuple(2);
    record.set(0, value);
    record.set(1, Long.toString(-value));
    samples.add(record);
  }
  result = rs.exec(input);
  assertEquals(result.get(0), doubled, "Incorrect number of samples seen");
  // k is unchanged by the second call
  assertEquals(result.get(1), k, "Incorrect value of k");
  final DataBag secondSamples = (DataBag) result.get(2);
  assertEquals(secondSamples.size(), Math.min(k, doubled));
}
Aggregations