Search in sources :

Example 91 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class FrequentStringsSketchToEstimatesTest method exact.

/**
 * Runs FrequentStringsSketchToEstimates on a serialized sketch that saw "a" twice and "b" once,
 * and checks that the resulting bag holds one 4-field tuple per item, in the order "a", "b":
 * the item itself followed by three long values that all equal the exact count of that item.
 */
@Test
public void exact() throws Exception {
    final EvalFunc<DataBag> toEstimates = new FrequentStringsSketchToEstimates();

    final ItemsSketch<String> input = new ItemsSketch<String>(8);
    input.update("a");
    input.update("a");
    input.update("b");
    final byte[] serialized = input.toByteArray(new ArrayOfStringsSerDe());

    final DataBag result = toEstimates.exec(PigUtil.objectsToTuple(new DataByteArray(serialized)));
    Assert.assertNotNull(result);
    Assert.assertEquals(result.size(), 2);

    // Expected rows, in iteration order: item and its exact count.
    final String[] expectedItems = { "a", "b" };
    final long[] expectedCounts = { 2L, 1L };

    final Iterator<Tuple> rows = result.iterator();
    for (int row = 0; row < expectedItems.length; row++) {
        final Tuple current = rows.next();
        Assert.assertEquals(current.size(), 4);
        Assert.assertEquals((String) current.get(0), expectedItems[row]);
        // Fields 1..3 are all expected to equal the exact count.
        for (int field = 1; field <= 3; field++) {
            Assert.assertEquals((long) current.get(field), expectedCounts[row]);
        }
    }
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 92 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class UnionFrequentStringsSketchTest method accumulator.

/**
 * Feeds UnionFrequentStringsSketch, through the Accumulator interface, two separate bags that each
 * contain one serialized sketch with single occurrences of "a" and "b". The accumulated value must
 * deserialize to a non-empty sketch with two active items, each with an estimate of 2.
 */
@Test
public void accumulator() throws Exception {
    final Accumulator<Tuple> union = new UnionFrequentStringsSketch("8");

    // Two accumulate() calls, each with a fresh bag holding an identical one-sketch payload.
    for (int call = 0; call < 2; call++) {
        final DataBag sketchBag = BagFactory.getInstance().newDefaultBag();
        final ItemsSketch<String> input = new ItemsSketch<String>(8);
        input.update("a");
        input.update("b");
        final byte[] serialized = input.toByteArray(new ArrayOfStringsSerDe());
        sketchBag.add(PigUtil.objectsToTuple(new DataByteArray(serialized)));
        union.accumulate(PigUtil.objectsToTuple(sketchBag));
    }

    final Tuple accumulated = union.getValue();
    Assert.assertNotNull(accumulated);
    Assert.assertEquals(accumulated.size(), 1);

    final DataByteArray resultBytes = (DataByteArray) accumulated.get(0);
    Assert.assertTrue(resultBytes.size() > 0);

    final ItemsSketch<String> merged =
        ItemsSketch.getInstance(Memory.wrap(resultBytes.get()), new ArrayOfStringsSerDe());
    Assert.assertFalse(merged.isEmpty());
    Assert.assertEquals(merged.getNumActiveItems(), 2);
    Assert.assertEquals(merged.getEstimate("a"), 2);
    Assert.assertEquals(merged.getEstimate("b"), 2);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 93 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class UnionFrequentStringsSketchTest method accumulatorEmptySketch.

/**
 * Accumulates a single bag whose only entry is a serialized sketch that received no updates, and
 * checks that the accumulated value is still a non-zero-length byte array that deserializes to an
 * empty sketch with no active items.
 */
@Test
public void accumulatorEmptySketch() throws Exception {
    final Accumulator<Tuple> union = new UnionFrequentStringsSketch("8");

    // Serialize a sketch that has never been updated and wrap it in a one-element bag.
    final ItemsSketch<String> untouched = new ItemsSketch<String>(8);
    final byte[] serialized = untouched.toByteArray(new ArrayOfStringsSerDe());
    final DataBag sketchBag = BagFactory.getInstance().newDefaultBag();
    sketchBag.add(PigUtil.objectsToTuple(new DataByteArray(serialized)));

    union.accumulate(PigUtil.objectsToTuple(sketchBag));

    final Tuple accumulated = union.getValue();
    Assert.assertNotNull(accumulated);
    Assert.assertEquals(accumulated.size(), 1);

    final DataByteArray resultBytes = (DataByteArray) accumulated.get(0);
    Assert.assertTrue(resultBytes.size() > 0);

    final ItemsSketch<String> merged =
        ItemsSketch.getInstance(Memory.wrap(resultBytes.get()), new ArrayOfStringsSerDe());
    Assert.assertTrue(merged.isEmpty());
    Assert.assertEquals(merged.getNumActiveItems(), 0);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 94 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class Union method accumulate.

// ACCUMULATOR INTERFACE
/**
 ***********************************************************************************************
 * <i>Accumulator</i> counterpart of the standard <i>exec()</i> method. It receives the same
 * input as <i>exec()</i> (a bag of Sketch Tuples) but, instead of serializing a result, it folds
 * the bag into a Union kept on this instance. It may therefore be invoked repeatedly, each call
 * contributing another bag of Sketch Tuples to the same Union.
 *
 * <p>The Union is created on the first call. A call for which no bag can be extracted from the
 * input leaves the Union's contents untouched.
 *
 * @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
 * @see #exec
 * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)"
 * @throws IOException by Pig
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
    // throws is in API
    // Lazily build the Union the first time this accumulator is used.
    if (accumUnion_ == null) {
        accumUnion_ = SetOperation.builder()
            .setP(p_)
            .setSeed(seed_)
            .setResizeFactor(RF)
            .setNominalEntries(nomEntries_)
            .buildUnion();
    }
    final DataBag sketchBag = extractBag(inputTuple);
    if (sketchBag != null) {
        updateUnion(sketchBag, accumUnion_);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag)

Example 95 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class Union method exec.

// @formatter:off
/**
 **********************************************************************************************
 * Top-level exec function.
 * This method accepts an input Tuple containing a Bag of one or more inner <b>Sketch Tuples</b>
 * and returns a single updated <b>Sketch</b> as a <b>Sketch Tuple</b>.
 *
 * <p>If a large number of calls are anticipated, leveraging either the <i>Algebraic</i> or
 * <i>Accumulator</i> interfaces is recommended. Pig normally handles this automatically.
 *
 * <p>Internally, this method presents the inner <b>Sketch Tuples</b> to a new <b>Union</b>.
 * The result is returned as a <b>Sketch Tuple</b>. The Union is only built once a bag has
 * actually been extracted from the input, so the no-bag path allocates nothing.
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Sketch Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Sketch Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
 * @return Sketch Tuple. If inputTuple is null or empty, returns empty sketch (8 bytes).
 * @throws IOException by Pig
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 */
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    // throws is in API
    // The exec is a stateless function.  It operates on the input and returns a result.
    // It can only call static functions.
    final DataBag bag = extractBag(inputTuple);
    if (bag == null) {
        // Nothing to union: hand back the shared empty-sketch tuple (configured with parent)
        // without paying for a Union that would be thrown away.
        return emptyCompactOrderedSketchTuple_;
    }
    // A fresh Union per call keeps exec free of state carried between invocations.
    final com.yahoo.sketches.theta.Union union = SetOperation.builder()
        .setP(p_)
        .setSeed(seed_)
        .setResizeFactor(RF)
        .setNominalEntries(nomEntries_)
        .buildUnion();
    updateUnion(bag, union);
    // Per the theta Union API, 'true' requests an ordered result and 'null' supplies no
    // destination memory.
    final CompactSketch compactSketch = union.getResult(true, null);
    return compactOrderedSketchToTuple(compactSketch);
}
Also used : CompactSketch(com.yahoo.sketches.theta.CompactSketch) DataBag(org.apache.pig.data.DataBag)

Aggregations

DataBag (org.apache.pig.data.DataBag)266 Tuple (org.apache.pig.data.Tuple)223 Test (org.testng.annotations.Test)142 DataByteArray (org.apache.pig.data.DataByteArray)103 IOException (java.io.IOException)20 Estimate (com.yahoo.sketches.pig.theta.Estimate)19 EvalFunc (org.apache.pig.EvalFunc)16 HllSketch (com.yahoo.sketches.hll.HllSketch)14 DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary)13 DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer)13 Test (org.junit.Test)13 ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe)12 ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch)12 ExecException (org.apache.pig.backend.executionengine.ExecException)12 ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch)11 ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder)11 Map (java.util.Map)11 ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch)10 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10