Search in sources :

Example 16 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class PageRankTest method testAvroMReflectInMemory.

/**
 * Invokes {@code run} with the in-memory pipeline, the Avro type family, and a
 * reflection-based Avro {@code PType} for {@code PageRankData}.
 */
@Test
public void testAvroMReflectInMemory() throws Exception {
    final PTypeFamily avroFamily = AvroTypeFamily.getInstance();
    final PType<PageRankData> reflectedType = Avros.reflects(PageRankData.class);
    run(MemPipeline.getInstance(), reflectedType, avroFamily);
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Test(org.junit.Test)

Example 17 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class TermFrequencyTest method run.

/**
 * Counts how often each word occurs per document title in a temp copy of {@code docs.txt},
 * writes the counts as text, and asserts that the output contains the line
 * {@code [well,A]<TAB>1} (and does not contain {@code [well,A]<TAB>0}).
 *
 * @param pipeline    pipeline used to read the input, write the outputs, run, and read the result back
 * @param typeFamily  not read by this method; the type family of the input collection is used instead
 *                    (NOTE(review): confirm whether this parameter was meant to replace it)
 * @param transformTF when {@code true}, the counts are additionally re-keyed by word and written
 *                    to a second text output
 * @throws IOException declared by the signature; presumably raised by the file helpers or the read-back
 */
public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean transformTF) throws IOException {
    final String inputPath = FileHelper.createTempCopyOf("docs.txt");
    final File transformedOutput = FileHelper.createOutputPath();
    final File tfOutput = FileHelper.createOutputPath();
    final PCollection<String> docs = pipeline.readTextFile(inputPath);
    final PTypeFamily ptf = docs.getTypeFamily();

    // Each input line is "title<TAB>text"; emit one (lower-cased word, title) pair per non-empty token.
    final DoFn<String, Pair<String, String>> emitWordTitlePairs = new DoFn<String, Pair<String, String>>() {

        @Override
        public void process(String doc, Emitter<Pair<String, String>> emitter) {
            final String[] fields = doc.split("\t");
            final String title = fields[0];
            final String text = fields[1];
            for (String token : text.split("\\W+")) {
                if (token.isEmpty()) {
                    continue;
                }
                emitter.emit(Pair.of(token.toLowerCase(), title));
            }
        }
    };

    // Result: PTable<Pair<word, title>, count of word in title>.
    final PTable<Pair<String, String>, Long> termCounts = Aggregate.count(
            docs.parallelDo("term document frequency", emitWordTitlePairs, ptf.pairs(ptf.strings(), ptf.strings())));

    if (transformTF) {
        // Re-key Pair<Pair<word, title>, count> as Pair<word, Pair<title, count>>.
        final MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>> rekeyByWord =
                new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {

            @Override
            public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> entry) {
                final Pair<String, String> wordAndTitle = entry.first();
                final Pair<String, Long> titleAndCount = Pair.of(wordAndTitle.second(), entry.second());
                return Pair.of(wordAndTitle.first(), titleAndCount);
            }
        };
        final PTable<String, Pair<String, Long>> countsByWord = termCounts.parallelDo(
                "transform wordDocumentPairCount", rekeyByWord,
                ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
        pipeline.writeTextFile(countsByWord, transformedOutput.getAbsolutePath());
    }

    final SourceTarget<String> target = At.textFile(tfOutput.getAbsolutePath());
    pipeline.write(termCounts, target);
    pipeline.run();

    // Read the written counts back and look for the one line this test cares about.
    final Iterable<String> writtenLines = ((ReadableSourceTarget<String>) target).read(pipeline.getConfiguration());
    boolean sawExpectedCount = false;
    for (String line : writtenLines) {
        if ("[well,A]\t0".equals(line)) {
            fail("Found " + line + " but well is in Document A 1 time");
        } else if ("[well,A]\t1".equals(line)) {
            sawExpectedCount = true;
        }
    }
    assertTrue(sawExpectedCount);
    pipeline.done();
}
Also used : ReadableSourceTarget(org.apache.crunch.io.ReadableSourceTarget) PTypeFamily(org.apache.crunch.types.PTypeFamily) File(java.io.File)

Example 18 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class MaterializeToMapTest method testMRMaterializeToMap.

/**
 * Reads a temp copy of {@code set1.txt} through an {@code MRPipeline}, maps the lines with
 * {@code Set1Mapper} into an (Integer, String) table, materializes that table as a map,
 * and hands the map to {@code assertMatches}.
 */
@Test
public void testMRMaterializeToMap() throws IOException {
    final Pipeline mrPipeline = new MRPipeline(MaterializeToMapTest.class);
    final PCollection<String> lines = mrPipeline.readTextFile(FileHelper.createTempCopyOf("set1.txt"));
    final PTypeFamily family = lines.getTypeFamily();
    final PTable<Integer, String> table =
            lines.parallelDo(new Set1Mapper(), family.tableOf(family.ints(), family.strings()));
    assertMatches(table.materializeToMap());
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) MRPipeline(org.apache.crunch.impl.mr.MRPipeline) MemPipeline(org.apache.crunch.impl.mem.MemPipeline) Test(org.junit.Test)

Example 19 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class Aggregate method max.

/**
 * Finds the greatest element of the input collection, ordering elements by their own
 * {@link Comparable#compareTo(Object)}.
 *
 * <p>A first function keeps a running maximum over the elements it processes and emits it
 * from {@code cleanup} under the key {@code true}; the emitted candidates are then grouped
 * by that key and combined down to the largest one.
 *
 * @param collect the collection to scan; its element class must be primitive or implement
 *                {@link Comparable}
 * @return the values of the combined table
 * @throws IllegalArgumentException if the element class is neither primitive nor {@code Comparable}
 */
public static <S> PCollection<S> max(PCollection<S> collect) {
    Class<S> elementClass = collect.getPType().getTypeClass();
    boolean orderable = elementClass.isPrimitive() || Comparable.class.isAssignableFrom(elementClass);
    if (!orderable) {
        throw new IllegalArgumentException("Can only get max for Comparable elements, not for: " + collect.getPType().getTypeClass());
    }
    PTypeFamily tf = collect.getTypeFamily();

    // Tracks the largest element seen by this function instance; nothing is emitted until cleanup.
    DoFn<S, Pair<Boolean, S>> runningMax = new DoFn<S, Pair<Boolean, S>>() {

        private transient S best = null;

        @Override
        public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
            if (best != null && ((Comparable<S>) best).compareTo(input) >= 0) {
                return;
            }
            best = input;
        }

        @Override
        public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
            if (best == null) {
                return;
            }
            emitter.emit(Pair.of(true, best));
        }
    };

    return PTables.values(collect.parallelDo("max", runningMax, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1).combineValues(new CombineFn<Boolean, S>() {

        @Override
        public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
            S best = null;
            for (S candidate : input.second()) {
                boolean larger = best == null || ((Comparable<S>) best).compareTo(candidate) < 0;
                if (larger) {
                    best = candidate;
                }
            }
            emitter.emit(Pair.of(input.first(), best));
        }
    }));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Emitter(org.apache.crunch.Emitter) DoFn(org.apache.crunch.DoFn) CombineFn(org.apache.crunch.CombineFn) Pair(org.apache.crunch.Pair)

Example 20 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class Cartesian method cross.

/**
 * Performs a full cross join on the specified {@link PCollection}s (using the same strategy as Pig's CROSS operator).
 *
 * <p>Both inputs are first keyed by a pair of integers via {@code GFCross}, the two keyed
 * tables are cogrouped, and every left value in a group is paired with every right value
 * in the same group.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left A PCollection to perform a cross join on.
 * @param right A PCollection to perform a cross join on.
 * @param parallelism Value handed to both {@code GFCross} functions; presumably controls how
 *        many key groups the inputs are spread over (confirm against {@code GFCross}).
 * @param <U> Type of the first {@link PCollection}'s values
 * @param <V> Type of the second {@link PCollection}'s values
 * @return The joined result as tuples of (U,V).
 * @throws IllegalStateException if the left type family yields no table type
 */
public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
    PTypeFamily ltf = left.getTypeFamily();
    PTypeFamily rtf = right.getTypeFamily();
    PTableType<Pair<Integer, Integer>, U> leftTableType = ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType());
    if (leftTableType == null) {
        throw new IllegalStateException("Type family " + ltf + " returned no table type for the left side of the cross join");
    }
    // Reuse the table type that was just checked instead of building an identical one again.
    PTable<Pair<Integer, Integer>, U> leftCross = left.parallelDo(new GFCross<U>(0, parallelism), leftTableType);
    PTable<Pair<Integer, Integer>, V> rightCross = right.parallelDo(new GFCross<V>(1, parallelism), rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType()));
    PTable<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> cg = leftCross.cogroup(rightCross);
    PTypeFamily ctf = cg.getTypeFamily();
    return cg.parallelDo(new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>>, Pair<U, V>>() {

        @Override
        public void process(Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> input, Emitter<Pair<U, V>> emitter) {
            // Emit the Cartesian product of the left and right values that share this key.
            Collection<U> leftValues = input.second().first();
            Collection<V> rightValues = input.second().second();
            for (U l : leftValues) {
                for (V r : rightValues) {
                    emitter.emit(Pair.of(l, r));
                }
            }
        }
    }, ctf.pairs(left.getPType(), right.getPType()));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Collection(java.util.Collection) PCollection(org.apache.crunch.PCollection) Pair(org.apache.crunch.Pair)

Aggregations

PTypeFamily (org.apache.crunch.types.PTypeFamily): 26; Pair (org.apache.crunch.Pair): 15; GroupingOptions (org.apache.crunch.GroupingOptions): 7; MRPipeline (org.apache.crunch.impl.mr.MRPipeline): 7; Test (org.junit.Test): 7; Configuration (org.apache.hadoop.conf.Configuration): 6; Collection (java.util.Collection): 5; PCollection (org.apache.crunch.PCollection): 4; CombineFn (org.apache.crunch.CombineFn): 2; DoFn (org.apache.crunch.DoFn): 2; Emitter (org.apache.crunch.Emitter): 2; Tuple3 (org.apache.crunch.Tuple3): 2; File (java.io.File): 1; List (java.util.List): 1; Tuple4 (org.apache.crunch.Tuple4): 1; TupleN (org.apache.crunch.TupleN): 1; MemPipeline (org.apache.crunch.impl.mem.MemPipeline): 1; CrunchRuntimeException (org.apache.crunch.impl.mr.run.CrunchRuntimeException): 1; ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget): 1; SourcePathTargetImpl (org.apache.crunch.io.impl.SourcePathTargetImpl): 1