Use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
From class DataToStringsSketchTest, method execNormalCase.
/**
 * Runs {@code DataToStringsSketch} on a bag holding the single item "a" and verifies that the
 * sketch recovered from the result tuple is not empty and that its {@code getN()} is 1.
 */
@Test
public void execNormalCase() throws Exception {
  final EvalFunc<Tuple> udf = new DataToStringsSketch();

  // input: a tuple wrapping a bag that contains one single-field tuple
  final DataBag inputBag = BAG_FACTORY.newDefaultBag();
  inputBag.add(TUPLE_FACTORY.newTuple("a"));
  final Tuple wrappedInput = TUPLE_FACTORY.newTuple(inputBag);

  final Tuple output = udf.exec(wrappedInput);

  final ItemsSketch<String> result = getSketch(output);
  Assert.assertFalse(result.isEmpty());
  Assert.assertEquals(result.getN(), 1);
}
Use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
From class ReservoirUnion, method accumulate.
/**
 * Folds every reservoir tuple found in the input bag into the running union.
 *
 * <p>exec() could be overloaded for easy cases, but the incoming reservoir's k would still have
 * to be compared against max k and possibly downsampled.
 *
 * @param inputTuple a tuple whose field 0 is a bag of reservoir tuples, each read as
 *        (n, k, bag of samples); a null tuple, an empty tuple or a null field 0 is ignored
 * @throws IOException an ExecException is raised when processing a reservoir tuple fails with
 *         an IndexOutOfBoundsException
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
  final boolean hasReservoirs =
      inputTuple != null && inputTuple.size() >= 1 && !inputTuple.isNull(0);
  if (!hasReservoirs) {
    return;
  }

  final DataBag incoming = (DataBag) inputTuple.get(0);

  // the union is created on first use
  if (union_ == null) {
    union_ = ReservoirItemsUnion.newInstance(maxK_);
  }

  try {
    for (Tuple reservoir : incoming) {
      // no up-front check for a null or too-short tuple: the reads below throw instead
      final long reservoirN = (long) reservoir.get(0);
      final int reservoirK = (int) reservoir.get(1);
      final ArrayList<Tuple> items =
          ReservoirSampling.dataBagToArrayList((DataBag) reservoir.get(2));
      union_.update(reservoirN, reservoirK, items);
    }
  } catch (final IndexOutOfBoundsException tooShort) {
    throw new ExecException("Cannot update union with given reservoir", tooShort);
  }
}
Use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
From class ReservoirUnion, method getValue.
/**
 * Packages the current union result as a Pig tuple.
 *
 * @return {@code null} when no union has been created yet; otherwise the tuple produced by
 *         {@code ReservoirSampling.createResultTuple} from the result sketch's n, its k and a
 *         bag wrapping its raw samples
 */
@Override
public Tuple getValue() {
  if (union_ != null) {
    final ReservoirItemsSketch<Tuple> merged = union_.getResult();

    // newDefaultBag(List<Tuple>) does *not* copy values
    final List<Tuple> rawSamples = SamplingPigUtil.getRawSamplesAsList(merged);
    final DataBag bagOfSamples = BagFactory.getInstance().newDefaultBag(rawSamples);

    return ReservoirSampling.createResultTuple(merged.getN(), merged.getK(), bagOfSamples);
  }
  return null;
}
Use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
From class VarOptUnion, method accumulate.
/**
 * Merges every serialized sketch found in the input bag into the running union.
 *
 * <p>exec() could be overloaded for easy cases, but the incoming reservoir's k would still have
 * to be compared against max k and possibly downsampled.
 *
 * @param inputTuple a tuple whose field 0 is a bag of tuples, each carrying a DataByteArray in
 *        its field 0; a null tuple, an empty tuple or a null field 0 is ignored
 * @throws IOException declared by the Accumulator API
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
  final boolean hasSketches =
      inputTuple != null && inputTuple.size() >= 1 && !inputTuple.isNull(0);
  if (!hasSketches) {
    return;
  }

  final DataBag serializedSketches = (DataBag) inputTuple.get(0);

  // the union is created on first use
  if (union_ == null) {
    union_ = VarOptItemsUnion.newInstance(maxK_);
  }

  for (Tuple entry : serializedSketches) {
    final DataByteArray bytes = (DataByteArray) entry.get(0);
    // wrap the raw bytes and hand them to the union together with the serde
    final Memory sketchMem = Memory.wrap(bytes.get());
    union_.update(sketchMem, SERDE);
  }
}
Use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
From class DataToSketch, method accumulate.
// ACCUMULATOR INTERFACE

/**
 * The <i>Accumulator</i> counterpart of the standard <i>exec()</i> method. As with
 * <i>exec()</i>, the input is a bag of Datum Tuples. The difference is that no sketch is
 * serialized at the end, so this method may be invoked repeatedly, each call feeding one more
 * bag of Datum Tuples into the sketch.
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * @throws IOException by Pig
 * @see #exec
 * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)"
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException { // throws is in API
  // the union is set up on first use, before the bag is extracted
  if (accumUnion_ == null) {
    accumUnion_ = DataToSketch.newUnion(nomEntries_, p_, seed_);
  }

  final DataBag datumBag = extractBag(inputTuple);
  if (datumBag != null) {
    updateUnion(datumBag, accumUnion_);
  }
}
Aggregations