use of org.apache.pig.data.DataBag in project mongo-hadoop by mongodb.
Class TOBAG, method exec.
/**
 * Flattens the nested tuple held in field 0 of {@code input} into a bag,
 * producing one single-field tuple per element of the nested tuple.
 *
 * @param input a tuple whose first field is itself a {@code Tuple} of elements
 * @return a DataBag with one tuple per nested element, or {@code null} when
 *         the input is missing or empty (Pig convention for absent input)
 * @throws IOException if the nested tuple cannot be read or converted
 */
public DataBag exec(final Tuple input) throws IOException {
    if (input == null || input.size() == 0) {
        return null;
    }
    try {
        DataBag output = mBagFactory.newDefaultBag();
        // UDF contract: field 0 holds the tuple whose elements we bag.
        Tuple nested = (Tuple) input.get(0);
        for (Object o : nested.getAll()) {
            output.add(mTupleFactory.newTuple(o));
        }
        return output;
    } catch (Exception e) {
        // Previously the exception was swallowed and null returned, which hid
        // the root cause; rethrow with context so failures are diagnosable.
        throw new IOException("Failed to convert input tuple to bag: " + input, e);
    }
}
use of org.apache.pig.data.DataBag in project akela by mozilla-metrics.
Class Example, method realexec.
/**
 * Returns the first non-null tuple found in the bag stored in field 0 of
 * {@code input}, or {@code null} if the bag holds no non-null tuples.
 *
 * @param input a tuple whose first field is a {@code DataBag}
 * @return the first non-null tuple, or {@code null} if none exists
 * @throws IOException if field 0 cannot be read
 */
private static Tuple realexec(Tuple input) throws IOException {
    DataBag bag = (DataBag) input.get(0);
    // DataBag is Iterable<Tuple>, so a for-each reads more naturally than
    // an explicit Iterator loop.
    for (Tuple candidate : bag) {
        if (candidate != null) {
            return candidate;
        }
    }
    return null;
}
use of org.apache.pig.data.DataBag in project akela by mozilla-metrics.
Class JsonMap, method makeSafe.
/**
 * Recursively converts a map's values into types that Pig can handle:
 * lists become DataBags and nested maps are converted the same way.
 * All other values — including nulls — are passed through unchanged.
 *
 * @param m the map to convert; must not be null
 * @return a new map with the same keys and Pig-safe values
 */
@SuppressWarnings("unchecked")
protected Map<String, Object> makeSafe(Map<String, Object> m) {
    Map<String, Object> safeValues = new HashMap<String, Object>();
    for (Map.Entry<String, Object> entry : m.entrySet()) {
        Object v = entry.getValue();
        // instanceof is already false for null, so the former explicit
        // null checks were redundant.
        if (v instanceof List) {
            safeValues.put(entry.getKey(), convertListToBag((List<Object>) v));
        } else if (v instanceof Map) {
            safeValues.put(entry.getKey(), makeSafe((Map<String, Object>) v));
        } else {
            safeValues.put(entry.getKey(), v);
        }
    }
    return safeValues;
}
use of org.apache.pig.data.DataBag in project common-crawl by matpalm.
Class Ngrams, method bagify.
/**
 * Wraps each n-gram in a tuple and collects them into a bag. Unigrams are
 * stored as a single-field tuple; longer n-grams are split on spaces into
 * a multi-field tuple.
 *
 * @param ngrams the n-gram strings to bag
 * @return a DataBag containing one tuple per input n-gram
 */
private DataBag bagify(Collection<String> ngrams) {
    DataBag bag = mBagFactory.newDefaultBag();
    // ngramLength is loop-invariant, so test it once rather than per element;
    // braces added throughout — the original unbraced for/if was bug-prone.
    if (ngramLength == 1) {
        for (String ngram : ngrams) {
            bag.add(mTupleFactory.newTuple(ngram));
        }
    } else {
        for (String ngram : ngrams) {
            bag.add(mTupleFactory.newTuple(Lists.newArrayList(ngram.split(" "))));
        }
    }
    return bag;
}
use of org.apache.pig.data.DataBag in project varaha by thedatachef.
Class TokenizeText, method exec.
/**
 * Tokenizes the text in field 0 of {@code input} with Lucene's
 * StandardAnalyzer, then runs the tokens through several Lucene filters:
 * <ul>
 *   <li>LengthFilter — keeps only words of length &gt;= minWordSize</li>
 *   <li>ShingleFilter — turns the word stream into an n-gram stream</li>
 *   <li>PatternReplaceFilter — strips the 'filler' token that ShingleFilter
 *       inserts in place of stopwords</li>
 * </ul>
 *
 * @param input a tuple whose first field is the text to tokenize
 * @return a bag of token tuples, or {@code null} when the input is missing or null
 * @throws IOException if tokenization fails
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }
    TokenStream tokens = analyzer.tokenStream(NOFIELD, input.get(0).toString());
    // Enforce only the minimum word length; allow arbitrarily long words.
    LengthFilter longEnough = new LengthFilter(Version.LUCENE_44, tokens, minWordSize, Integer.MAX_VALUE);
    if (minGramSize == 1 && maxGramSize == 1) {
        // Unigram-only request: no shingling necessary.
        return fillBag(longEnough);
    }
    ShingleFilter shingles = new ShingleFilter(longEnough, minGramSize, maxGramSize);
    shingles.setOutputUnigrams(outputUnigrams);
    PatternReplaceFilter noFiller = new PatternReplaceFilter(shingles, SHINGLE_FILLER, NOFIELD, true);
    return fillBag(noFiller);
}
Aggregations