use of cascading.flow.Flow in project parquet-mr by apache.
the class ParquetScroogeSchemeTest method doRead.
private void doRead() throws Exception {
Path path = new Path(txtOutputPath);
final FileSystem fs = path.getFileSystem(new Configuration());
if (fs.exists(path))
fs.delete(path, true);
Scheme sourceScheme = new ParquetScroogeScheme<Name>(Name.class);
Tap source = new Hfs(sourceScheme, parquetOutputPath);
Scheme sinkScheme = new TextLine(new Fields("first", "last"));
Tap sink = new Hfs(sinkScheme, txtOutputPath);
Pipe assembly = new Pipe("namecp");
assembly = new Each(assembly, new UnpackThriftFunction());
Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly);
flow.complete();
String result = FileUtils.readFileToString(new File(txtOutputPath + "/part-00000"));
assertEquals("0\tAlice\tPractice\n15\tBob\tHope\n24\tCharlie\tHorse\n", result);
}
use of cascading.flow.Flow in project parquet-mr by apache.
the class TestParquetTupleScheme method testFieldProjection.
@Test
public void testFieldProjection() throws Exception {
createFileForRead();
Path path = new Path(txtOutputPath);
final FileSystem fs = path.getFileSystem(new Configuration());
if (fs.exists(path))
fs.delete(path, true);
Scheme sourceScheme = new ParquetTupleScheme(new Fields("last_name"));
Tap source = new Hfs(sourceScheme, parquetInputPath);
Scheme sinkScheme = new TextLine(new Fields("last_name"));
Tap sink = new Hfs(sinkScheme, txtOutputPath);
Pipe assembly = new Pipe("namecp");
assembly = new Each(assembly, new ProjectedTupleFunction());
Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly);
flow.complete();
String result = FileUtils.readFileToString(new File(txtOutputPath + "/part-00000"));
assertEquals("Practice\nHope\nHorse\n", result);
}
use of cascading.flow.Flow in project parquet-mr by apache.
the class TestParquetTupleScheme method testReadWrite.
public void testReadWrite(String inputPath) throws Exception {
createFileForRead();
Path path = new Path(txtOutputPath);
final FileSystem fs = path.getFileSystem(new Configuration());
if (fs.exists(path))
fs.delete(path, true);
Scheme sourceScheme = new ParquetTupleScheme(new Fields("first_name", "last_name"));
Tap source = new Hfs(sourceScheme, inputPath);
Scheme sinkScheme = new TextLine(new Fields("first", "last"));
Tap sink = new Hfs(sinkScheme, txtOutputPath);
Pipe assembly = new Pipe("namecp");
assembly = new Each(assembly, new UnpackTupleFunction());
Flow flow = new HadoopFlowConnector().connect("namecp", source, sink, assembly);
flow.complete();
String result = FileUtils.readFileToString(new File(txtOutputPath + "/part-00000"));
assertEquals("Alice\tPractice\nBob\tHope\nCharlie\tHorse\n", result);
}
use of cascading.flow.Flow in project ambrose by twitter.
the class Main method main.
public static void main(String[] args) {
String docPath = "src/test/data/rain.txt";
String wcPath = "output/out";
String stopPath = "src/test/data/en.stop";
String tfidfPath = "output/out2";
Properties properties = new Properties();
AppProps.setApplicationJarClass(properties, Main.class);
HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);
// create source and sink taps
Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);
Fields stop = new Fields("stop");
Tap stopTap = new Hfs(new TextDelimited(stop, true, "\t"), stopPath);
Tap tfidfTap = new Hfs(new TextDelimited(true, "\t"), tfidfPath);
// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields("token");
Fields text = new Fields("text");
RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
Fields fieldSelector = new Fields("doc_id", "token");
Pipe docPipe = new Each("token", text, splitter, fieldSelector);
// define "ScrubFunction" to clean up the token stream
Fields scrubArguments = new Fields("doc_id", "token");
docPipe = new Each(docPipe, scrubArguments, new ScrubFunction(scrubArguments), Fields.RESULTS);
// perform a left join to remove stop words, discarding the rows
// which joined with stop words, i.e., were non-null after left join
Pipe stopPipe = new Pipe("stop");
Pipe tokenPipe = new HashJoin(docPipe, token, stopPipe, stop, new LeftJoin());
tokenPipe = new Each(tokenPipe, stop, new RegexFilter("^$"));
tokenPipe = new Retain(tokenPipe, fieldSelector);
// one branch of the flow tallies the token counts for term frequency (TF)
Pipe tfPipe = new Pipe("TF", tokenPipe);
Fields tf_count = new Fields("tf_count");
tfPipe = new CountBy(tfPipe, new Fields("doc_id", "token"), tf_count);
Fields tf_token = new Fields("tf_token");
tfPipe = new Rename(tfPipe, token, tf_token);
// one branch counts the number of documents (D)
Fields doc_id = new Fields("doc_id");
Fields tally = new Fields("tally");
Fields rhs_join = new Fields("rhs_join");
Fields n_docs = new Fields("n_docs");
Pipe dPipe = new Unique("D", tokenPipe, doc_id);
dPipe = new Each(dPipe, new Insert(tally, 1), Fields.ALL);
dPipe = new Each(dPipe, new Insert(rhs_join, 1), Fields.ALL);
dPipe = new SumBy(dPipe, rhs_join, tally, n_docs, long.class);
// one branch tallies the token counts for document frequency (DF)
Pipe dfPipe = new Unique("DF", tokenPipe, Fields.ALL);
Fields df_count = new Fields("df_count");
dfPipe = new CountBy(dfPipe, token, df_count);
Fields df_token = new Fields("df_token");
Fields lhs_join = new Fields("lhs_join");
dfPipe = new Rename(dfPipe, token, df_token);
dfPipe = new Each(dfPipe, new Insert(lhs_join, 1), Fields.ALL);
// join to bring together all the components for calculating TF-IDF
// the D side of the join is smaller, so it goes on the RHS
Pipe idfPipe = new HashJoin(dfPipe, lhs_join, dPipe, rhs_join);
// the IDF side of the join is smaller, so it goes on the RHS
Pipe tfidfPipe = new CoGroup(tfPipe, tf_token, idfPipe, df_token);
// calculate the TF-IDF weights, per token, per document
Fields tfidf = new Fields("tfidf");
String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
ExpressionFunction tfidfExpression = new ExpressionFunction(tfidf, expression, Double.class);
Fields tfidfArguments = new Fields("tf_count", "df_count", "n_docs");
tfidfPipe = new Each(tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL);
fieldSelector = new Fields("tf_token", "doc_id", "tfidf");
tfidfPipe = new Retain(tfidfPipe, fieldSelector);
tfidfPipe = new Rename(tfidfPipe, tf_token, token);
// keep track of the word counts, which are useful for QA
Pipe wcPipe = new Pipe("wc", tfPipe);
Fields count = new Fields("count");
wcPipe = new SumBy(wcPipe, tf_token, tf_count, count, long.class);
wcPipe = new Rename(wcPipe, tf_token, token);
// additionally, sort by count
wcPipe = new GroupBy(wcPipe, count, count);
// connect the taps, pipes, etc., into a flow
FlowDef flowDef = FlowDef.flowDef().setName("tfidf").addSource(docPipe, docTap).addSource(stopPipe, stopTap).addTailSink(tfidfPipe, tfidfTap).addTailSink(wcPipe, wcTap);
// run ambrose and cascading
Flow tfidfFlow = flowConnector.connect(flowDef);
EmbeddedAmbroseCascadingNotifier server = new EmbeddedAmbroseCascadingNotifier();
tfidfFlow.addListener(server);
tfidfFlow.addStepListener(server);
tfidfFlow.complete();
}
Aggregations