Use of org.apache.crunch.types.PTypeFamily in project crunch by Cloudera.
The class PageRankTest, method testAvroMReflectInMemory:
@Test
public void testAvroMReflectInMemory() throws Exception {
  PTypeFamily tf = AvroTypeFamily.getInstance();
  PType<PageRankData> prType = Avros.reflects(PageRankData.class);
  run(MemPipeline.getInstance(), prType, tf);
}
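For reference, Avros.reflects builds an Avro reflect-based PType from a plain Java value class. The sketch below shows what such a class might look like; the fields are assumptions for illustration, and the actual PageRankData used by the Crunch tests may differ.

import java.util.List;

// Hypothetical reflect-serializable value class; the real PageRankData in the
// Crunch tests may declare different fields and helper methods.
public class PageRankData {

  public float score;        // current PageRank score
  public float lastScore;    // score from the previous iteration
  public List<String> urls;  // outbound links

  // Avro reflection requires a public no-arg constructor.
  public PageRankData() {
  }
}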
Use of org.apache.crunch.types.PTypeFamily in project crunch by Cloudera.
The class TermFrequencyTest, method run:
public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean transformTF) throws IOException {
  String input = FileHelper.createTempCopyOf("docs.txt");
  File transformedOutput = FileHelper.createOutputPath();
  File tfOutput = FileHelper.createOutputPath();
  PCollection<String> docs = pipeline.readTextFile(input);
  PTypeFamily ptf = docs.getTypeFamily();
  /*
   * Input: String
   * Input title text
   *
   * Output: PTable<Pair<String, String>, Long>
   * Pair<Pair<word, title>, count in title>
   */
  PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
      new DoFn<String, Pair<String, String>>() {

        @Override
        public void process(String doc, Emitter<Pair<String, String>> emitter) {
          String[] kv = doc.split("\t");
          String title = kv[0];
          String text = kv[1];
          for (String word : text.split("\\W+")) {
            if (word.length() > 0) {
              Pair<String, String> pair = Pair.of(word.toLowerCase(), title);
              emitter.emit(pair);
            }
          }
        }
      }, ptf.pairs(ptf.strings(), ptf.strings())));

  if (transformTF) {
    /*
     * Input: Pair<Pair<String, String>, Long>
     * Pair<Pair<word, title>, count in title>
     *
     * Output: PTable<String, Pair<String, Long>>
     * PTable<word, Pair<title, count in title>>
     */
    PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentPairCount",
        new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {

          @Override
          public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> input) {
            Pair<String, String> wordDocumentPair = input.first();
            return Pair.of(wordDocumentPair.first(), Pair.of(wordDocumentPair.second(), input.second()));
          }
        }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
    pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
  }
  SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
  pipeline.write(tf, st);
  pipeline.run();
  // Verify the counts: the word "well" should appear exactly once in document A
  Iterable<String> lines = ((ReadableSourceTarget<String>) st).read(pipeline.getConfiguration());
  boolean passed = false;
  for (String line : lines) {
    if ("[well,A]\t0".equals(line)) {
      fail("Found " + line + " but 'well' appears in document A exactly once");
    }
    if ("[well,A]\t1".equals(line)) {
      passed = true;
    }
  }
  assertTrue(passed);
  pipeline.done();
}
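A test in the same class would typically drive run with an in-memory pipeline. The sketch below is an assumption for illustration; the method name and the choice of the Writable type family are not taken from the source.

// Hypothetical caller; the actual TermFrequencyTest test methods may differ.
@Test
public void testTermFrequencyInMemory() throws IOException {
  run(MemPipeline.getInstance(), WritableTypeFamily.getInstance(), true);
}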
Use of org.apache.crunch.types.PTypeFamily in project crunch by Cloudera.
The class MaterializeToMapTest, method testMRMaterializeToMap:
@Test
public void testMRMaterializeToMap() throws IOException {
  Pipeline p = new MRPipeline(MaterializeToMapTest.class);
  String inputFile = FileHelper.createTempCopyOf("set1.txt");
  PCollection<String> c = p.readTextFile(inputFile);
  PTypeFamily tf = c.getTypeFamily();
  PTable<Integer, String> t = c.parallelDo(new Set1Mapper(), tf.tableOf(tf.ints(), tf.strings()));
  Map<Integer, String> m = t.materializeToMap();
  assertMatches(m);
}
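Set1Mapper is defined elsewhere in the test sources. As a rough, hypothetical stand-in (not the project's actual implementation), a DoFn that keys each line of set1.txt with an Integer could look like this:

// Hypothetical stand-in for Set1Mapper; the real class may use a different keying scheme.
static class Set1Mapper extends DoFn<String, Pair<Integer, String>> {

  @Override
  public void process(String input, Emitter<Pair<Integer, String>> emitter) {
    // Key each line by its length so the output forms a PTable<Integer, String>.
    emitter.emit(Pair.of(input.length(), input));
  }
}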
Use of org.apache.crunch.types.PTypeFamily in project crunch by Cloudera.
The class Aggregate, method max:
/**
 * Returns a {@code PCollection} containing the single largest element of the input
 * collection, as determined by the natural ordering of its element type (which must
 * be {@code Comparable}).
 */
public static <S> PCollection<S> max(PCollection<S> collect) {
  Class<S> clazz = collect.getPType().getTypeClass();
  if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
    throw new IllegalArgumentException("Can only get max for Comparable elements, not for: "
        + collect.getPType().getTypeClass());
  }
  PTypeFamily tf = collect.getTypeFamily();
  return PTables.values(collect.parallelDo("max", new DoFn<S, Pair<Boolean, S>>() {

    // Tracks the largest element seen by this task; emitted once in cleanup.
    private transient S max = null;

    @Override
    public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
      if (max == null || ((Comparable<S>) max).compareTo(input) < 0) {
        max = input;
      }
    }

    @Override
    public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
      if (max != null) {
        emitter.emit(Pair.of(true, max));
      }
    }
  }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1).combineValues(new CombineFn<Boolean, S>() {

    // All per-task maxima share the same Boolean key, so a single reduce group
    // combines them into the global maximum.
    @Override
    public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
      S max = null;
      for (S v : input.second()) {
        if (max == null || ((Comparable<S>) max).compareTo(v) < 0) {
          max = v;
        }
      }
      emitter.emit(Pair.of(input.first(), max));
    }
  }));
}
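For illustration, a caller might compute the maximum of a small in-memory collection as in the sketch below. The MemPipeline/Writables combination and the sample values are assumptions, not taken from the source.

import org.apache.crunch.PCollection;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.lib.Aggregate;
import org.apache.crunch.types.writable.Writables;

// Hypothetical usage sketch: find the largest element of an in-memory collection.
public class MaxExample {

  public static void main(String[] args) {
    PCollection<Integer> nums = MemPipeline.typedCollectionOf(Writables.ints(), 3, 17, 5, 9);
    // Aggregate.max returns a PCollection holding the single largest element (17 here).
    for (Integer max : Aggregate.max(nums).materialize()) {
      System.out.println(max);
    }
  }
}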
Use of org.apache.crunch.types.PTypeFamily in project crunch by Cloudera.
The class Cartesian, method cross:
/**
 * Performs a full cross join on the specified {@link PCollection}s (using the same
 * strategy as Pig's CROSS operator).
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left A PCollection to perform a cross join on.
 * @param right A PCollection to perform a cross join on.
 * @param parallelism The grid dimension used by the underlying {@code GFCross} replication
 *          strategy; larger values spread the join across more reduce groups.
 * @param <U> Type of the first {@link PCollection}'s values
 * @param <V> Type of the second {@link PCollection}'s values
 * @return The joined result as tuples of (U,V).
 */
public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
  PTypeFamily ltf = left.getTypeFamily();
  PTypeFamily rtf = right.getTypeFamily();
  // Tag the elements of each side with (row, column) grid-cell keys so that the
  // subsequent cogroup brings together the left and right elements to be paired.
  PTableType<Pair<Integer, Integer>, U> leftType = ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType());
  PTableType<Pair<Integer, Integer>, V> rightType = rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType());
  PTable<Pair<Integer, Integer>, U> leftCross = left.parallelDo(new GFCross<U>(0, parallelism), leftType);
  PTable<Pair<Integer, Integer>, V> rightCross = right.parallelDo(new GFCross<V>(1, parallelism), rightType);
  PTable<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> cg = leftCross.cogroup(rightCross);
  PTypeFamily ctf = cg.getTypeFamily();
  return cg.parallelDo(new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>>, Pair<U, V>>() {

    @Override
    public void process(Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> input, Emitter<Pair<U, V>> emitter) {
      // Emit every pairing of the co-grouped left and right elements in this grid cell.
      for (U l : input.second().first()) {
        for (V r : input.second().second()) {
          emitter.emit(Pair.of(l, r));
        }
      }
    }
  }, ctf.pairs(left.getPType(), right.getPType()));
}
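As a usage illustration, crossing two small in-memory collections might look like the sketch below. The pipeline, type family, sample data, and the parallelism value of 1 are all assumptions made for the example.

import org.apache.crunch.PCollection;
import org.apache.crunch.Pair;
import org.apache.crunch.impl.mem.MemPipeline;
import org.apache.crunch.lib.Cartesian;
import org.apache.crunch.types.writable.Writables;

// Hypothetical usage sketch: a full cross join of two in-memory collections.
public class CrossExample {

  public static void main(String[] args) {
    PCollection<String> colors = MemPipeline.typedCollectionOf(Writables.strings(), "red", "blue");
    PCollection<Integer> sizes = MemPipeline.typedCollectionOf(Writables.ints(), 1, 2, 3);
    // Produces all 2 x 3 = 6 (color, size) pairs; parallelism 1 keeps the grid trivial.
    PCollection<Pair<String, Integer>> crossed = Cartesian.cross(colors, sizes, 1);
    for (Pair<String, Integer> p : crossed.materialize()) {
      System.out.println(p.first() + "\t" + p.second());
    }
  }
}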