use of org.apache.pig.data.DataBag in project akela by mozilla-metrics.
the class BloomFilterDistinctCount method exec.
/**
 * Approximates the number of distinct string values in a bag using a Guava Bloom filter.
 *
 * <p>The input tuple must hold exactly one field, and that field must be a bag. The first
 * field of every tuple in the bag is read as a {@code String}. A value is counted the first
 * time the filter reports it as definitely absent; it is then added to the filter. Because a
 * Bloom filter can report false positives, the result may undercount but never overcounts.
 *
 * <p>Null tuples and null first fields are skipped rather than handed to the filter, whose
 * string funnel would otherwise fail with a {@code NullPointerException}.
 *
 * @param input tuple whose single field is the bag of values to count
 * @return the approximate number of distinct non-null values in the bag
 * @throws IOException if a field cannot be read from a tuple
 * @throws RuntimeException if {@code input} does not have exactly one field, or that field
 *         is not a bag
 */
@Override
public Integer exec(Tuple input) throws IOException {
    if (input.size() != 1) {
        throw new RuntimeException("Expected input to have only a single field");
    }
    if (input.getType(0) != DataType.BAG) {
        throw new RuntimeException("Expected a BAG as input");
    }
    // guava bloom; n and p are declared outside this method
    // (presumably expected insertions and false-positive probability -- confirm)
    BloomFilter<CharSequence> filter = BloomFilter.create(Funnels.stringFunnel(), n, p);
    int uniq = 0;
    DataBag db = (DataBag) input.get(0);
    for (Tuple t : db) {
        if (t == null) {
            continue;
        }
        // Read and cast the field once instead of once per filter call.
        String value = (String) t.get(0);
        if (value == null) {
            // Nulls cannot be hashed by the funnel and are not counted as a distinct value.
            continue;
        }
        if (!filter.mightContain(value)) {
            filter.put(value);
            uniq++;
        }
    }
    return uniq;
}
use of org.apache.pig.data.DataBag in project akela by mozilla-metrics.
the class ConvertBagToTuple method exec.
/**
 * Flattens a bag into a single tuple.
 *
 * <p>The first field of {@code input} is read as a bag. The fields of every tuple in that
 * bag are appended, in iteration order, to one output tuple.
 *
 * @param input tuple whose first field is the bag to flatten
 * @return {@code null} if {@code input} is null or empty; an empty tuple if the bag field
 *         is null; otherwise a tuple holding all fields of all tuples in the bag
 * @throws IOException if a field cannot be read from the input tuple
 */
@Override
public Tuple exec(Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    Tuple output = tupleFactory.newTuple();
    DataBag db = (DataBag) input.get(0);
    if (db == null) {
        // A null bag field previously caused a NullPointerException on db.iterator();
        // treat it as "nothing to flatten" and hand back the empty tuple.
        return output;
    }
    for (Tuple t : db) {
        for (Object o : t.getAll()) {
            output.append(o);
        }
    }
    return output;
}
use of org.apache.pig.data.DataBag in project akela by mozilla-metrics.
the class ConvertMapToBag method exec.
/**
 * Turns a map into a bag of two-field tuples, one tuple per map entry.
 *
 * <p>Field 0 of each tuple is the entry's key and field 1 is its value.
 *
 * @param input tuple whose first field is the map to convert
 * @return {@code null} if {@code input} is null or empty; an empty bag if the map field is
 *         null; otherwise a bag with one (key, value) tuple per entry
 * @throws IOException if a field cannot be read from or written to a tuple
 */
@SuppressWarnings("unchecked")
@Override
public DataBag exec(Tuple input) throws IOException {
    boolean nothingToConvert = (input == null) || (input.size() == 0);
    if (nothingToConvert) {
        return null;
    }

    Map<Object, Object> source = (Map<Object, Object>) input.get(0);
    DataBag pairs = bagFactory.newDefaultBag();
    if (source == null) {
        // No map present: the result is an empty bag, not null.
        return pairs;
    }

    for (Map.Entry<Object, Object> keyValue : source.entrySet()) {
        Tuple pair = tupleFactory.newTuple(2);
        pair.set(0, keyValue.getKey());
        pair.set(1, keyValue.getValue());
        pairs.add(pair);
    }
    return pairs;
}
use of org.apache.pig.data.DataBag in project akela by mozilla-metrics.
the class JsonMap method convertListToBag.
/**
 * Converts a List into a DataBag so the value can be handed to Pig.
 *
 * <p>Elements that are not Lists are appended, in order, to a single tuple; that tuple is
 * added to the bag after the loop, and only if it received at least one element. Each
 * element that is itself a List is converted recursively and the resulting bag is merged
 * into this bag via {@code addAll}.
 *
 * @param l the list to convert; may contain nested lists
 * @return a bag built from the contents of {@code l}
 */
@SuppressWarnings("unchecked")
private DataBag convertListToBag(List<Object> l) {
    DataBag result = bagFactory.newDefaultBag();
    Tuple scalars = tupleFactory.newTuple();

    for (Object element : l) {
        if (!(element instanceof List)) {
            scalars.append(element);
            continue;
        }
        // Nested list: convert it on its own and fold its bag into ours.
        DataBag nested = convertListToBag((List<Object>) element);
        result.addAll(nested);
    }

    boolean collectedScalars = scalars.size() > 0;
    if (collectedScalars) {
        result.add(scalars);
    }
    return result;
}
use of org.apache.pig.data.DataBag in project varaha by thedatachef.
the class StanfordTokenize method exec.
/**
 * Splits text into tokens with {@code PTBTokenizer} and returns them as a bag.
 *
 * <p>The first field of {@code input} is converted with {@code toString()} and tokenized;
 * each token's string form becomes a single-field tuple in the output bag.
 *
 * @param input tuple whose first field holds the text to tokenize
 * @return {@code null} if {@code input} is null, has no fields, or its first field is null;
 *         otherwise a bag with one tuple per token
 * @throws IOException if the input field cannot be read
 */
public DataBag exec(Tuple input) throws IOException {
    boolean nothingToTokenize = input == null || input.size() < 1 || input.isNull(0);
    if (nothingToTokenize) {
        return null;
    }

    // Output bag
    DataBag tokens = bagFactory.newDefaultBag();

    String text = input.get(0).toString();
    StringReader reader = new StringReader(text);
    PTBTokenizer ptbt = new PTBTokenizer(reader, new CoreLabelTokenFactory(), "");

    while (ptbt.hasNext()) {
        CoreLabel token = (CoreLabel) ptbt.next();
        tokens.add(tupleFactory.newTuple(token.toString()));
    }
    return tokens;
}
Aggregations