use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirUnionTest method checkDegenerateInput.
/**
 * Checks how {@code ReservoirUnion.exec()} reacts to degenerate input.
 *
 * <p>A null tuple, an empty tuple and a tuple whose only field is null are each expected to
 * yield a null result. A bag holding a reservoir tuple with only 2 fields is expected to raise
 * an {@code ExecException}.
 */
@Test
public void checkDegenerateInput() {
  // using default max k value
  final ReservoirUnion ru = new ReservoirUnion();
  Tuple inputTuple;

  try {
    // input == null
    assertNull(ru.exec(null));

    // input.size() < 1
    inputTuple = TupleFactory.getInstance().newTuple(0);
    assertNull(ru.exec(inputTuple));

    // input.isNull(0);
    inputTuple = TupleFactory.getInstance().newTuple(1);
    inputTuple.set(0, null);
    assertNull(ru.exec(inputTuple));
  } catch (final IOException e) {
    // name the phase and include the exception so the failure is diagnosable
    fail("Unexpected exception for null/empty input: " + e);
  }

  try {
    // reservoir tuple with only 2 entries
    final Tuple reservoir = TupleFactory.getInstance().newTuple(2);
    reservoir.set(0, 256L);
    reservoir.set(1, 256);

    final DataBag reservoirBag = BagFactory.getInstance().newDefaultBag();
    reservoirBag.add(reservoir);

    inputTuple = TupleFactory.getInstance().newTuple(reservoirBag);
    ru.exec(inputTuple);
    fail("Did not catch expected ExecException");
  } catch (final ExecException e) {
    // expected
  } catch (final IOException e) {
    // any other IOException is a real failure; report it with its cause
    fail("Unexpected exception for malformed reservoir tuple: " + e);
  }
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class VarOptCommonAlgebraicTest method unionSketchesExec.
/**
 * Feeds the same bag of serialized VarOpt sketches to {@code UnionSketchesAsByteArray} and
 * {@code UnionSketchesAsTuple} and compares both outputs against a union built directly with
 * {@code VarOptItemsUnion}.
 */
@Test
public void unionSketchesExec() {
  // Only difference between UnionSketchesAsTuple and UnionSketchesAsByteArray is that one wraps
  // the resulting serialized sketch in a tuple. If the union result is still in exact mode, the
  // two sketches should be identical.
  final int numSketches = 3;
  // numSketches * numItemsPerSketch should be < k here
  final int numItemsPerSketch = 10;
  final int k = 100;
  final String kStr = Integer.toString(k);

  final VarOptCommonImpl.UnionSketchesAsTuple udfTuple =
      new VarOptCommonImpl.UnionSketchesAsTuple(kStr);
  final VarOptCommonImpl.UnionSketchesAsByteArray udfBA =
      new VarOptCommonImpl.UnionSketchesAsByteArray(kStr);

  final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
  // reference union, compared against the UDF output at the end
  final VarOptItemsUnion<Tuple> union = VarOptItemsUnion.newInstance(k);
  final VarOptItemsSketch<Tuple> vis = VarOptItemsSketch.newInstance(k);

  try {
    // running count of items generated so far; drives both the id and the weight
    int itemCount = 0;
    for (int sketchIdx = 0; sketchIdx < numSketches; ++sketchIdx) {
      vis.reset();
      for (int n = 0; n < numItemsPerSketch; ++n, ++itemCount) {
        final char id = (char) ('a' + itemCount);
        final double wt = 1.0 + itemCount;

        final Tuple item = TupleFactory.getInstance().newTuple(2);
        item.set(0, Character.toString(id));
        item.set(1, wt);
        vis.update(item, wt);
      }

      // serialize the sketch, wrap it in a single-field tuple, add to both bag and union
      final DataByteArray serialized = new DataByteArray(vis.toByteArray(serDe_));
      inputBag.add(TupleFactory.getInstance().newTuple(serialized));
      union.update(vis);
    }
  } catch (final ExecException e) {
    fail("Unexpected ExecException creating input data");
  }

  try {
    final Tuple inputTuple = TupleFactory.getInstance().newTuple(inputBag);

    // byte-array flavour
    final DataByteArray rawOutput = udfBA.exec(inputTuple);
    final VarOptItemsSketch<Tuple> fromByteArray =
        VarOptItemsSketch.heapify(Memory.wrap(rawOutput.get()), serDe_);

    // tuple flavour: the serialized sketch is field 0 of the returned tuple
    final Tuple wrappedOutput = udfTuple.exec(inputTuple);
    final DataByteArray unwrapped = (DataByteArray) wrappedOutput.get(0);
    final VarOptItemsSketch<Tuple> fromTuple =
        VarOptItemsSketch.heapify(Memory.wrap(unwrapped.get()), serDe_);

    final VarOptItemsSketch<Tuple> expected = union.getResult();
    compareResults(fromByteArray, expected);
    compareResults(fromTuple, expected);
  } catch (final IOException e) {
    fail("Unexpected IOException calling exec()");
  }
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class VarOptSamplingTest method algebraicFinal.
/**
 * Runs {@code VarOptSampling.Final} on a bag containing one serialized sketch that received no
 * updates and expects a non-null result bag of size 0.
 */
@Test
public void algebraicFinal() {
  final int k = 87;
  final int wtIdx = 2;
  final String kStr = Integer.toString(k);
  final String wtIdxStr = Integer.toString(wtIdx);
  final VarOptSampling.Final udf = new VarOptSampling.Final(kStr, wtIdxStr);

  // bag with a single tuple wrapping the serialized, never-updated sketch
  final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
  final VarOptItemsSketch<Tuple> vis = VarOptItemsSketch.newInstance(k);
  final DataByteArray serialized = new DataByteArray(vis.toByteArray(serDe_));
  inputBag.add(TupleFactory.getInstance().newTuple(serialized));

  final Tuple inputTuple = TupleFactory.getInstance().newTuple(inputBag);

  try {
    final DataBag samples = udf.exec(inputTuple);
    assertNotNull(samples);
    assertEquals(samples.size(), 0);
  } catch (final IOException e) {
    fail("Unexpected IOException");
  }
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class VarOptSamplingTest method standardAccumulate.
/**
 * Exercises the accumulate / getValue / cleanup cycle of {@code VarOptSampling}.
 *
 * <p>The same bag of {@code k - 1} three-field tuples is accumulated twice. The test then checks
 * that the result holds {@code k} samples, that every sample keeps its 3 fields with an id in
 * the open range (0, k), and that the summed result weights equal twice the input weight.
 */
@Test
public void standardAccumulate() {
  final int k = 10;
  final VarOptSampling udf = new VarOptSampling(Integer.toString(k), "0");

  final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
  double cumWeight = 0.0;
  try {
    int i = 1;
    while (i < k) {
      // fields: (weight as double, id, negated id)
      final Tuple row = TupleFactory.getInstance().newTuple(3);
      row.set(0, 1.0 * i);
      row.set(1, i);
      row.set(2, -i);
      inputBag.add(row);
      cumWeight += i;
      ++i;
    }
    final Tuple inputTuple = TupleFactory.getInstance().newTuple(inputBag);

    // nothing accumulated yet
    assertNull(udf.getValue());

    udf.accumulate(inputTuple);
    udf.accumulate(inputTuple);
    final DataBag samples = udf.getValue();

    // cleanup() should discard the accumulated state
    udf.cleanup();
    assertNull(udf.getValue());

    assertNotNull(samples);
    assertEquals(samples.size(), k);

    double cumResultWeight = 0.0;
    for (final Tuple weightedSample : samples) {
      // each result entry is (weight, original tuple)
      final Double weight = (Double) weightedSample.get(0);
      cumResultWeight += weight;

      final Tuple original = (Tuple) weightedSample.get(1);
      assertEquals(original.size(), 3);

      final int id = (Integer) original.get(1);
      assertTrue(id > 0);
      assertTrue(id < k);
    }

    // called accumulate() twice
    assertEquals(cumResultWeight, 2 * cumWeight, EPS);
  } catch (final IOException e) {
    fail("Unexpected exception");
  }
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class VarOptUnionTest method checkExecution.
/**
 * Exercises the accumulate / getValue / cleanup cycle of {@code VarOptUnion}.
 *
 * <p>Serialized sketches are placed in a bag that is accumulated twice. The UDF output is
 * heapified and compared against a {@code VarOptItemsUnion} that was updated twice with each of
 * the same sketches.
 */
@Test
public void checkExecution() {
  final int k = 5;
  final VarOptUnion udf = new VarOptUnion(Integer.toString(k));

  final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
  final Tuple inputTuple = TupleFactory.getInstance().newTuple(1);

  try {
    final VarOptItemsSketch<Tuple> sketch = VarOptItemsSketch.newInstance(k);
    final VarOptItemsUnion<Tuple> union = VarOptItemsUnion.newInstance(k);

    // NOTE: with k = 5, k / 2 == 2, so this loop body runs exactly once (i == 1)
    for (int i = 1; i < k / 2; ++i) {
      sketch.reset();
      final Tuple t = TupleFactory.getInstance().newTuple(3);
      t.set(0, 1.0 * i);
      t.set(1, i);
      t.set(2, -i);
      sketch.update(t, 1.0 * i);

      // serialize sketch and wrap in Tuple, add to both bag and union
      final Tuple sketchWrapper = TupleFactory.getInstance().newTuple(1);
      final DataByteArray dba = new DataByteArray(sketch.toByteArray(new ArrayOfTuplesSerDe()));
      sketchWrapper.set(0, dba);
      inputBag.add(sketchWrapper);

      union.update(sketch);
      // calling accumulate() twice later
      union.update(sketch);
    }
    inputTuple.set(0, inputBag);

    // nothing accumulated yet
    assertNull(udf.getValue());

    udf.accumulate(inputTuple);
    udf.accumulate(inputTuple);
    final DataByteArray outBytes = udf.getValue();

    // cleanup() should discard the accumulated state
    udf.cleanup();
    assertNull(udf.getValue());

    // fail with an assertion, not a NullPointerException, if the UDF produced no output
    assertNotNull(outBytes);

    final VarOptItemsSketch<Tuple> result =
        VarOptItemsSketch.heapify(Memory.wrap(outBytes.get()), new ArrayOfTuplesSerDe());
    assertNotNull(result);
    VarOptCommonAlgebraicTest.compareResults(result, union.getResult());
  } catch (final IOException e) {
    // include the exception so the failure is diagnosable
    fail("Unexpected exception: " + e);
  }
}
Aggregations