use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirSamplingTest method execTest.
/**
 * Calls {@code exec()} on a bag holding fewer items than k, then again after the same bag has
 * grown past k, checking the reported item count, the value of k, and the number of samples.
 */
@Test
public void execTest() throws IOException {
  // same scenario as the accumulate() tests, since accumulate() handles both data paths
  final int reservoirSize = 32;
  final long batchSize = 24;
  final TupleFactory tupleFactory = TupleFactory.getInstance();
  final DataBag items = BagFactory.getInstance().newDefaultBag();

  // first batch: tuples (i, "-i") for i in [0, batchSize)
  long next = 0;
  while (next < batchSize) {
    final Tuple row = tupleFactory.newTuple(2);
    row.set(0, next);
    row.set(1, Long.toString(-next));
    items.add(row);
    ++next;
  }

  final ReservoirSampling sampler = new ReservoirSampling(Integer.toString(reservoirSize));
  final Tuple input = tupleFactory.newTuple(items);

  Tuple result = sampler.exec(input);
  assertEquals(result.size(), 3, "Incorrect output size");
  final Object firstSeen = result.get(0);
  assertEquals(firstSeen, batchSize, "Incorrect number of samples seen");
  final Object firstK = result.get(1);
  assertEquals(firstK, reservoirSize, "Incorrect value of k");
  final DataBag firstSamples = (DataBag) result.get(2);
  assertEquals(firstSamples.size(), batchSize);

  // second batch: keep counting from batchSize up to 2 * batchSize in the same bag, then re-run
  final long doubled = 2 * batchSize;
  while (next < doubled) {
    final Tuple row = tupleFactory.newTuple(2);
    row.set(0, next);
    row.set(1, Long.toString(-next));
    items.add(row);
    ++next;
  }

  result = sampler.exec(input);
  final Object secondSeen = result.get(0);
  assertEquals(secondSeen, doubled, "Incorrect number of samples seen");
  // k must be unchanged
  final Object secondK = result.get(1);
  assertEquals(secondK, reservoirSize, "Incorrect value of k");
  final DataBag secondSamples = (DataBag) result.get(2);
  assertEquals(secondSamples.size(), Math.min(reservoirSize, doubled));
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirSamplingTest method accumulateTest.
/**
 * Drives the accumulate()/getValue()/cleanup() sequence by hand: one pass with fewer items
 * than k, a second pass over the same input, then a cleanup followed by a null accumulate.
 */
@Test
public void accumulateTest() throws IOException {
  // exec() is automatically composed by calling accumulate(), getValue(), and cleanup(), in
  // order, but it includes a fast-return route, so the pieces still need a separate test
  final int reservoirSize = 32;
  final long batchSize = 24;
  final TupleFactory tupleFactory = TupleFactory.getInstance();
  final DataBag items = BagFactory.getInstance().newDefaultBag();

  // tuples (i, "-i") for i in [0, batchSize)
  long idx = 0;
  while (idx < batchSize) {
    final Tuple row = tupleFactory.newTuple(2);
    row.set(0, idx);
    row.set(1, Long.toString(-idx));
    items.add(row);
    ++idx;
  }

  final ReservoirSampling sampler = new ReservoirSampling(Integer.toString(reservoirSize));
  final Tuple input = tupleFactory.newTuple(items);

  sampler.accumulate(input);
  Tuple result = sampler.getValue();
  assertEquals(result.size(), 3, "Incorrect output size");
  final Object firstSeen = result.get(0);
  assertEquals(firstSeen, batchSize, "Incorrect number of samples seen");
  final Object firstK = result.get(1);
  assertEquals(firstK, reservoirSize, "Incorrect value of k");
  final DataBag firstSamples = (DataBag) result.get(2);
  assertEquals(firstSamples.size(), batchSize);

  // feed the identical input a second time
  final long doubled = 2 * batchSize;
  sampler.accumulate(input);
  result = sampler.getValue();
  final Object secondSeen = result.get(0);
  assertEquals(secondSeen, doubled, "Incorrect number of samples seen");
  // k must be unchanged
  final Object secondK = result.get(1);
  assertEquals(secondK, reservoirSize, "Incorrect value of k");
  final DataBag secondSamples = (DataBag) result.get(2);
  assertEquals(secondSamples.size(), Math.min(reservoirSize, doubled));

  // after cleanup, a degenerate (null) accumulate must leave getValue() returning null
  sampler.cleanup();
  sampler.accumulate(null);
  assertNull(sampler.getValue());
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirSamplingTest method generateDataBag.
/**
 * Builds a bag of two-field tuples {@code (val, "-val")} where {@code val} runs over
 * {@code startIdx, startIdx + 1, ...} for {@code numItems} consecutive values.
 *
 * <p>Field 0 is stored as an {@code int} and field 1 as the decimal string of its negation.
 * The loop counter is an {@code int}, so {@code numItems} is expected to fit in an int.
 *
 * @param numItems number of tuples to add to the bag
 * @param startIdx value placed in field 0 of the first tuple
 * @return the populated bag; if a tuple field cannot be set the test is failed instead
 */
static DataBag generateDataBag(final long numItems, final int startIdx) {
  final DataBag output = BagFactory.getInstance().newDefaultBag();
  // the factory lookup does not depend on the loop, so do it once
  final TupleFactory tupleFactory = TupleFactory.getInstance();
  try {
    for (int i = 0; i < numItems; ++i) {
      final Tuple t = tupleFactory.newTuple(2);
      final int val = startIdx + i;
      t.set(0, val);
      t.set(1, Integer.toString(-val));
      output.add(t);
    }
  } catch (final ExecException e) {
    // pass the exception along as the cause so the failure report keeps its stack trace
    fail(e.getMessage(), e);
  }
  return output;
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirSamplingTest method intermediateFinalExec.
/**
 * Passes a bag of three reservoir tuples {@code (n, k, samples)} to
 * {@code ReservoirSampling.IntermediateFinal} and checks that the merged result reports the
 * summed item count, k equal to maxK, exactly maxK samples, and only sample values drawn from
 * the range covered by the inputs.
 */
@Test
public void intermediateFinalExec() throws IOException {
  final int maxK = 128;
  final EvalFunc<Tuple> rs = new ReservoirSampling.IntermediateFinal(Integer.toString(maxK));

  // need at least 3 conditions:
  // 1. n <= k <= maxK
  // 2. n <= k, k > maxK
  // 3. n > k
  final DataBag bagOfReservoirs = BagFactory.getInstance().newDefaultBag();

  // condition 1: n = 32, k = maxK, values [0, 32)
  Tuple t = TupleFactory.getInstance().newTuple(3);
  t.set(0, 32L);
  t.set(1, maxK);
  t.set(2, generateDataBag(32, 0));
  bagOfReservoirs.add(t);

  // condition 2: n = 64, k = 256 (> maxK), values [32, 96)
  t = TupleFactory.getInstance().newTuple(3);
  t.set(0, 64L);
  t.set(1, 256);
  t.set(2, generateDataBag(64, 32));
  bagOfReservoirs.add(t);

  // condition 3: n = 256 (> k), k = maxK, values [96, 96 + maxK)
  t = TupleFactory.getInstance().newTuple(3);
  t.set(0, 256L);
  t.set(1, maxK);
  t.set(2, generateDataBag(maxK, 96));
  bagOfReservoirs.add(t);

  final Tuple input = TupleFactory.getInstance().newTuple(1);
  input.set(0, bagOfReservoirs);

  final Tuple result = rs.exec(input);

  final long tgtN = 32 + 64 + 256;
  // only added maxK to last bag, so this is the exclusive upper bound on sample values
  final int tgtMaxVal = 32 + 64 + maxK;

  assertEquals(result.size(), 3, "Incorrect output size");
  assertEquals(result.get(0), tgtN, "Incorrect number of samples seen");
  assertEquals(result.get(1), maxK, "Incorrect value of k");
  assertEquals(((DataBag) result.get(2)).size(), maxK);

  // check that they're all in the target range [0, tgtMaxVal)
  for (final Tuple sample : ((DataBag) result.get(2))) {
    final int val = (int) sample.get(0);
    if (val < 0 || val >= tgtMaxVal) {
      // upper bound is exclusive, so report the range as half-open
      fail("Found value (" + val + ") outside target range [0, " + tgtMaxVal + ")");
    }
  }
}
use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.
the class ReservoirUnionTest method accumulateTest.
/**
 * Accumulates three reservoir tuples {@code (n, k, samples)} into a {@code ReservoirUnion}
 * across two accumulate() calls and checks that the result holds all 3n items in their
 * original order, then that getValue() returns null after cleanup().
 */
@Test
public void accumulateTest() {
  try {
    final long n = 20;
    final int k = 64;
    final TupleFactory tupleFactory = TupleFactory.getInstance();

    // three reservoirs with consecutive, non-overlapping value ranges
    final Tuple reservoir1 = tupleFactory.newTuple(3);
    reservoir1.set(0, n);
    reservoir1.set(1, k);
    reservoir1.set(2, ReservoirSamplingTest.generateDataBag(n, 0));

    final Tuple reservoir2 = tupleFactory.newTuple(3);
    reservoir2.set(0, n);
    reservoir2.set(1, k);
    reservoir2.set(2, ReservoirSamplingTest.generateDataBag(n, (int) n));

    final Tuple reservoir3 = tupleFactory.newTuple(3);
    reservoir3.set(0, n);
    reservoir3.set(1, k);
    reservoir3.set(2, ReservoirSamplingTest.generateDataBag(n, (int) (2 * n)));

    // first input carries two reservoirs, second input carries one
    final DataBag bag1 = BagFactory.getInstance().newDefaultBag();
    bag1.add(reservoir1);
    bag1.add(reservoir2);
    final Tuple input1 = tupleFactory.newTuple(bag1);

    final DataBag bag2 = BagFactory.getInstance().newDefaultBag();
    bag2.add(reservoir3);
    final Tuple input2 = tupleFactory.newTuple(bag2);

    final ReservoirUnion ru = new ReservoirUnion(Integer.toString(k));
    ru.accumulate(input1);
    ru.accumulate(input2);

    final Tuple result = ru.getValue();

    // assuming k >= 3n so all items still in reservoir, in order
    assertEquals(result.size(), 3, "Unexpected tuple size from UDF");
    assertEquals((long) result.get(0), 3 * n, "Incorrect total number of items seen");
    assertEquals((int) result.get(1), k, "Unexpected value of k");

    final DataBag outputSamples = (DataBag) result.get(2);
    assertEquals(outputSamples.size(), (long) result.get(0),
        "Output reservoir size does not match reported number of items");

    int i = 0;
    for (final Tuple t : outputSamples) {
      // expected format: (i:int, -i:chararray)
      assertEquals((int) t.get(0), i);
      assertEquals((String) t.get(1), Integer.toString(-i));
      ++i;
    }

    ru.cleanup();
    assertNull(ru.getValue());
  } catch (final IOException e) {
    // keep the original exception as the cause so the failure report shows what went wrong
    fail("Unexpected exception", e);
  }
}
Aggregations