Use of org.apache.crunch.io.ReadableSourceTarget in project crunch by cloudera.
The class TermFrequencyTest, method run:
public void run(Pipeline pipeline, PTypeFamily typeFamily, boolean transformTF) throws IOException {
  String input = FileHelper.createTempCopyOf("docs.txt");
  File transformedOutput = FileHelper.createOutputPath();
  File tfOutput = FileHelper.createOutputPath();
  PCollection<String> docs = pipeline.readTextFile(input);
  PTypeFamily ptf = docs.getTypeFamily();
  /*
   * Input: String
   * One line per document: title<TAB>text
   *
   * Output: PTable<Pair<String, String>, Long>
   * Pair<Pair<word, title>, count in title>
   */
  PTable<Pair<String, String>, Long> tf = Aggregate.count(docs.parallelDo("term document frequency",
      new DoFn<String, Pair<String, String>>() {
        @Override
        public void process(String doc, Emitter<Pair<String, String>> emitter) {
          String[] kv = doc.split("\t");
          String title = kv[0];
          String text = kv[1];
          for (String word : text.split("\\W+")) {
            if (word.length() > 0) {
              Pair<String, String> pair = Pair.of(word.toLowerCase(), title);
              emitter.emit(pair);
            }
          }
        }
      }, ptf.pairs(ptf.strings(), ptf.strings())));
  if (transformTF) {
    /*
     * Input: Pair<Pair<String, String>, Long>
     * Pair<Pair<word, title>, count in title>
     *
     * Output: PTable<String, Pair<String, Long>>
     * PTable<word, Pair<title, count in title>>
     */
    PTable<String, Pair<String, Long>> wordDocumentCountPair = tf.parallelDo("transform wordDocumentPairCount",
        new MapFn<Pair<Pair<String, String>, Long>, Pair<String, Pair<String, Long>>>() {
          @Override
          public Pair<String, Pair<String, Long>> map(Pair<Pair<String, String>, Long> input) {
            Pair<String, String> wordDocumentPair = input.first();
            return Pair.of(wordDocumentPair.first(), Pair.of(wordDocumentPair.second(), input.second()));
          }
        }, ptf.tableOf(ptf.strings(), ptf.pairs(ptf.strings(), ptf.longs())));
    pipeline.writeTextFile(wordDocumentCountPair, transformedOutput.getAbsolutePath());
  }
  SourceTarget<String> st = At.textFile(tfOutput.getAbsolutePath());
  pipeline.write(tf, st);
  pipeline.run();
  // Read the written output back through the ReadableSourceTarget and verify the expected count for [well,A].
  Iterable<String> lines = ((ReadableSourceTarget<String>) st).read(pipeline.getConfiguration());
  boolean passed = false;
  for (String line : lines) {
    if ("[well,A]\t0".equals(line)) {
      fail("Found " + line + " but well is in Document A 1 time");
    }
    if ("[well,A]\t1".equals(line)) {
      passed = true;
    }
  }
  assertTrue(passed);
  pipeline.done();
}
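The cast of the SourceTarget to ReadableSourceTarget is what lets the test iterate the written records in the client JVM after pipeline.run(). The same read-back can also be expressed with PCollection#materialize, which relies on the getMaterializeSourceTarget helper shown in the next example. A minimal sketch of that route, assuming an illustrative driver class and a local docs.txt (both hypothetical, not taken from the Crunch sources):

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;

public class MaterializeSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical driver: the class name and input path are placeholders.
    Pipeline pipeline = new MRPipeline(MaterializeSketch.class);
    PCollection<String> docs = pipeline.readTextFile("docs.txt");
    // materialize() registers a readable target for the collection; after run()
    // the iterable reads the job output back through that target.
    Iterable<String> lines = docs.materialize();
    pipeline.run();
    for (String line : lines) {
      System.out.println(line);
    }
    pipeline.done();
  }
}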
Use of org.apache.crunch.io.ReadableSourceTarget in project crunch by cloudera.
The class MRPipeline, method getMaterializeSourceTarget:
/**
 * Retrieve a ReadableSourceTarget that provides access to the contents of a
 * {@link PCollection}. This is primarily intended as a helper method to
 * {@link #materialize(PCollection)}. The underlying data of the
 * ReadableSourceTarget may not actually be present until the pipeline is run.
 *
 * @param pcollection
 *          The collection for which the ReadableSourceTarget is to be retrieved
 * @return The ReadableSourceTarget
 * @throws IllegalArgumentException
 *           If no ReadableSourceTarget can be retrieved for the given PCollection
 */
public <T> ReadableSourceTarget<T> getMaterializeSourceTarget(PCollection<T> pcollection) {
  PCollectionImpl<T> impl = toPcollectionImpl(pcollection);
  SourceTarget<T> matTarget = impl.getMaterializedAt();
  if (matTarget != null && matTarget instanceof ReadableSourceTarget) {
    return (ReadableSourceTarget<T>) matTarget;
  }
  ReadableSourceTarget<T> srcTarget = null;
  if (outputTargets.containsKey(pcollection)) {
    // Reuse an already-registered readable output target for this collection if one exists.
    for (Target target : outputTargets.get(impl)) {
      if (target instanceof ReadableSourceTarget) {
        srcTarget = (ReadableSourceTarget<T>) target;
        break;
      }
    }
  }
  if (srcTarget == null) {
    // Otherwise create an intermediate output; fail if the PType cannot produce a readable target.
    SourceTarget<T> st = createIntermediateOutput(pcollection.getPType());
    if (!(st instanceof ReadableSourceTarget)) {
      throw new IllegalArgumentException("The PType for the given PCollection is not readable"
          + " and cannot be materialized");
    } else {
      srcTarget = (ReadableSourceTarget<T>) st;
      addOutput(impl, srcTarget);
    }
  }
  return srcTarget;
}
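Callers normally reach this helper indirectly through materialize(PCollection), but since it is public it can also be used to read a collection's contents back directly once the pipeline has run. A caller-side sketch, not taken from the Crunch sources, with a hypothetical driver class and input path:

import java.io.IOException;
import org.apache.crunch.PCollection;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.io.ReadableSourceTarget;

public class MaterializeTargetSketch {
  public static void main(String[] args) throws IOException {
    // Hypothetical driver: the class name and "docs.txt" are placeholders.
    MRPipeline pipeline = new MRPipeline(MaterializeTargetSketch.class);
    PCollection<String> docs = pipeline.readTextFile("docs.txt");
    // Registers (or reuses) a readable output target for the collection,
    // so its data can be read back after the pipeline executes.
    ReadableSourceTarget<String> target = pipeline.getMaterializeSourceTarget(docs);
    pipeline.run();
    Iterable<String> lines = target.read(pipeline.getConfiguration());
    for (String line : lines) {
      System.out.println(line);
    }
    pipeline.done();
  }
}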