Search in sources :

Example 1 with ReadableSourceTarget

use of org.apache.crunch.io.ReadableSourceTarget in project crunch by cloudera.

the class TermFrequencyTest method run.

/**
 * Builds a term-frequency pipeline over a temporary copy of {@code docs.txt}, runs it, and
 * checks the text output for the expected count of the word "well" in document "A".
 *
 * <p>Each input line is split on a tab into a title and a text; the text is split on
 * non-word characters and every non-empty, lower-cased word is emitted paired with its title.
 * The pairs are then counted.
 *
 * <p>Once {@link Pipeline#run()} has returned, {@link Pipeline#done()} is always invoked, even
 * when the verification fails, so the pipeline is not left unfinished by a failing assertion.
 *
 * @param pipeline the pipeline used to read the input, write the outputs and run the job
 * @param typeFamily not used by this method; the type family is taken from the collection
 *          returned by {@code pipeline.readTextFile}
 * @param transformTF when {@code true}, the counts are additionally re-keyed by word and
 *          written as text to a second output path (that output is not verified here)
 * @throws IOException declared by the method; presumably raised by the file helpers or the
 *           read of the output (confirm against those APIs)
 */
public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean transformTF) throws IOException {
    String input = FileHelper.createTempCopyOf("docs.txt");
    File transformedOutput = FileHelper.createOutputPath();
    File tfOutput = FileHelper.createOutputPath();
    PCollection<String> docs = pipeline.readTextFile(input);
    // NOTE(review): the typeFamily parameter is ignored in favour of the collection's own
    // family. Kept as-is to preserve behavior; confirm whether that is intended.
    PTypeFamily ptf = docs.getTypeFamily();
    /*
     * Input: String
     * Input title  text
     *
     * Output: PTable<Pair<String, String>, Long>
     * Pair<Pair<word, title>, count in title>
     */
    PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency", new DoFn<String, Pair<String, String>>() {

        @Override
        public void process(String doc, Emitter<Pair<String, String>> emitter) {
            // A line is expected to be "<title>\t<text>"; a line without a tab fails on kv[1].
            String[] kv = doc.split("\t");
            String title = kv[0];
            String text = kv[1];
            for (String word : text.split("\\W+")) {
                // split() can yield an empty leading token when the text starts with a non-word character.
                if (word.length() > 0) {
                    Pair<String, String> pair = Pair.of(word.toLowerCase(), title);
                    emitter.emit(pair);
                }
            }
        }
    }, ptf.pairs(ptf.strings(), ptf.strings())));
    if (transformTF) {
        /*
         * Input: Pair<Pair<String, String>, Long>
         * Pair<Pair<word, title>, count in title>
         *
         * Output: PTable<String, Pair<String, Long>>
         * PTable<word, Pair<title, count in title>>
         */
        PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentPairCount", new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {

            @Override
            public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> input) {
                Pair<String, String> wordDocumentPair = input.first();
                return Pair.of(wordDocumentPair.first(), Pair.of(wordDocumentPair.second(), input.second()));
            }
        }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
        pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
    }
    SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
    pipeline.write(tf, st);
    pipeline.run();
    try {
        // test the case we should see
        Iterable<String> lines = ((ReadableSourceTarget<String>) st).read(pipeline.getConfiguration());
        boolean passed = false;
        // Scan every line: a "0" count must fail the test even if the "1" line was already seen.
        for (String line : lines) {
            if ("[well,A]\t0".equals(line)) {
                fail("Found " + line + " but well is in Document A 1 time");
            }
            if ("[well,A]\t1".equals(line)) {
                passed = true;
            }
        }
        assertTrue(passed);
    } finally {
        // Finish the pipeline even when the read or an assertion above throws.
        pipeline.done();
    }
}
Also used : ReadableSourceTarget(org.apache.crunch.io.ReadableSourceTarget) PTypeFamily(org.apache.crunch.types.PTypeFamily) File(java.io.File)

Example 2 with ReadableSourceTarget

use of org.apache.crunch.io.ReadableSourceTarget in project crunch by cloudera.

the class MRPipeline method getMaterializeSourceTarget.

/**
 * Retrieve a ReadableSourceTarget that provides access to the contents of a
 * {@link PCollection}. This is primarily intended as a helper method to
 * {@link #materialize(PCollection)}. The underlying data of the
 * ReadableSourceTarget may not be actually present until the pipeline is run.
 *
 * <p>Candidates are tried in this order: the target the collection is already
 * materialized at, the first readable target already registered as an output
 * for the collection, and finally a newly created intermediate output, which
 * is then registered as an output of the collection.
 *
 * @param pcollection
 *          The collection for which the ReadableSourceTarget is to be
 *          retrieved
 * @return The ReadableSourceTarget
 * @throws IllegalArgumentException
 *           If no ReadableSourceTarget can be retrieved for the given
 *           PCollection
 */
public <T> ReadableSourceTarget<T> getMaterializeSourceTarget(PCollection<T> pcollection) {
    PCollectionImpl<T> impl = toPcollectionImpl(pcollection);

    // instanceof is already false for null, so no separate null check is needed.
    SourceTarget<T> matTarget = impl.getMaterializedAt();
    if (matTarget instanceof ReadableSourceTarget) {
        return (ReadableSourceTarget<T>) matTarget;
    }

    // Use the same key (impl) for the membership test and the lookup so the
    // two can never disagree.
    if (outputTargets.containsKey(impl)) {
        for (Target target : outputTargets.get(impl)) {
            if (target instanceof ReadableSourceTarget) {
                return (ReadableSourceTarget<T>) target;
            }
        }
    }

    // Nothing readable exists yet: create an intermediate output and register it.
    SourceTarget<T> st = createIntermediateOutput(pcollection.getPType());
    if (!(st instanceof ReadableSourceTarget)) {
        throw new IllegalArgumentException("The PType for the given PCollection is not readable and cannot be materialized");
    }
    ReadableSourceTarget<T> srcTarget = (ReadableSourceTarget<T>) st;
    addOutput(impl, srcTarget);
    return srcTarget;
}
Also used : SourceTarget(org.apache.crunch.SourceTarget) Target(org.apache.crunch.Target) ReadableSourceTarget(org.apache.crunch.io.ReadableSourceTarget)

Aggregations

ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget)2 File (java.io.File)1 SourceTarget (org.apache.crunch.SourceTarget)1 Target (org.apache.crunch.Target)1 PTypeFamily (org.apache.crunch.types.PTypeFamily)1