Use of org.apache.pig.data.DataBag in project varaha by thedatachef: the exec method of the class TermVectorCentroid. This Pig UDF averages a bag of term vectors into a single centroid vector.
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0))
        return null;
    DataBag bagOfVectors = (DataBag) input.get(0);
    DataBag centroid = BagFactory.getInstance().newDefaultBag();
    HashMap<String, Double> termSums = new HashMap<String, Double>();
    //
    // Sum each term's weight across all of the vectors
    //
    for (Tuple t : bagOfVectors) {
        DataBag v = (DataBag) t.get(0);
        for (Tuple v_i : v) {
            if (!(v_i.isNull(0) || v_i.isNull(1))) {
                String term = v_i.get(0).toString();
                Double currentValue = termSums.get(term);
                if (currentValue == null) {
                    termSums.put(term, (Double) v_i.get(1));
                } else {
                    termSums.put(term, (Double) v_i.get(1) + currentValue);
                }
            }
        }
    }
    //
    // Go back through the hashmap and turn the sums into averages
    //
    for (Map.Entry<String, Double> pair : termSums.entrySet()) {
        Tuple termWeightPair = tupleFactory.newTuple(2);
        termWeightPair.set(0, pair.getKey());
        termWeightPair.set(1, pair.getValue() / bagOfVectors.size());
        centroid.add(termWeightPair);
    }
    return centroid;
}
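For context, here is a minimal sketch of exercising exec directly from Java, outside of a Pig script. The demo class name, the two input term vectors, and their weights are hypothetical; only the standard Pig TupleFactory/BagFactory calls and the TermVectorCentroid class from the snippet above are taken as given.

import java.util.Arrays;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class TermVectorCentroidDemo {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        BagFactory bf = BagFactory.getInstance();
        // Two hypothetical term vectors, each a bag of (term, weight) tuples
        DataBag v1 = bf.newDefaultBag();
        v1.add(tf.newTuple(Arrays.asList((Object) "hadoop", 0.4)));
        v1.add(tf.newTuple(Arrays.asList((Object) "pig", 0.6)));
        DataBag v2 = bf.newDefaultBag();
        v2.add(tf.newTuple(Arrays.asList((Object) "hadoop", 0.8)));
        // exec expects a bag of single-field tuples, each wrapping one vector
        DataBag bagOfVectors = bf.newDefaultBag();
        bagOfVectors.add(tf.newTuple(v1));
        bagOfVectors.add(tf.newTuple(v2));
        DataBag centroid = new TermVectorCentroid().exec(tf.newTuple(bagOfVectors));
        // Sums are divided by the number of vectors (2), so:
        // hadoop -> (0.4 + 0.8) / 2 = 0.6, pig -> 0.6 / 2 = 0.3
        System.out.println(centroid);
    }
}

Note that the divisor is the number of vectors, not the number of vectors containing a given term, so a term absent from some vectors is effectively averaged with zeros.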
Use of org.apache.pig.data.DataBag in project varaha by thedatachef: the fillBag method of the class TokenizeText.
/**
 * Fills a DataBag with tokens from a TokenStream.
 */
public DataBag fillBag(TokenStream stream) throws IOException {
    DataBag result = bagFactory.newDefaultBag();
    CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            // Skip zero-length tokens; wrap each term in a single-field tuple
            if (termAttribute.length() > 0) {
                Tuple termText = tupleFactory.newTuple(termAttribute.toString());
                result.add(termText);
            }
        }
        stream.end();
    } finally {
        stream.close();
    }
    return result;
}
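Similarly, a minimal sketch of feeding fillBag a Lucene TokenStream. The analyzer choice, the field name, and the wrapper class are illustrative assumptions, not necessarily what TokenizeText itself uses; older Lucene releases also require a Version argument to the StandardAnalyzer constructor.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.pig.data.DataBag;

public class FillBagDemo {
    // udf is an instance of the TokenizeText class shown above
    public static DataBag tokensFor(TokenizeText udf, String text) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();  // hypothetical analyzer choice
        // The field name is arbitrary; analyzers use it only for per-field configuration
        TokenStream stream = analyzer.tokenStream("text", new StringReader(text));
        return udf.fillBag(stream);  // fillBag drains the stream and closes it
    }
}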