Search in sources :

Example 1 with HadoopFlowConnector

use of cascading.flow.hadoop.HadoopFlowConnector in project ambrose by twitter.

the class Main method main.

public static void main(String[] args) {
    String docPath = "src/test/data/rain.txt";
    String wcPath = "output/out";
    String stopPath = "src/test/data/en.stop";
    String tfidfPath = "output/out2";
    Properties properties = new Properties();
    AppProps.setApplicationJarClass(properties, Main.class);
    HadoopFlowConnector flowConnector = new HadoopFlowConnector(properties);
    // create source and sink taps
    Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
    Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);
    Fields stop = new Fields("stop");
    Tap stopTap = new Hfs(new TextDelimited(stop, true, "\t"), stopPath);
    Tap tfidfTap = new Hfs(new TextDelimited(true, "\t"), tfidfPath);
    // specify a regex operation to split the "document" text lines into a token stream
    Fields token = new Fields("token");
    Fields text = new Fields("text");
    RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
    Fields fieldSelector = new Fields("doc_id", "token");
    Pipe docPipe = new Each("token", text, splitter, fieldSelector);
    // define "ScrubFunction" to clean up the token stream
    Fields scrubArguments = new Fields("doc_id", "token");
    docPipe = new Each(docPipe, scrubArguments, new ScrubFunction(scrubArguments), Fields.RESULTS);
    // perform a left join to remove stop words, discarding the rows
    // which joined with stop words, i.e., were non-null after left join
    Pipe stopPipe = new Pipe("stop");
    Pipe tokenPipe = new HashJoin(docPipe, token, stopPipe, stop, new LeftJoin());
    tokenPipe = new Each(tokenPipe, stop, new RegexFilter("^$"));
    tokenPipe = new Retain(tokenPipe, fieldSelector);
    // one branch of the flow tallies the token counts for term frequency (TF)
    Pipe tfPipe = new Pipe("TF", tokenPipe);
    Fields tf_count = new Fields("tf_count");
    tfPipe = new CountBy(tfPipe, new Fields("doc_id", "token"), tf_count);
    Fields tf_token = new Fields("tf_token");
    tfPipe = new Rename(tfPipe, token, tf_token);
    // one branch counts the number of documents (D)
    Fields doc_id = new Fields("doc_id");
    Fields tally = new Fields("tally");
    Fields rhs_join = new Fields("rhs_join");
    Fields n_docs = new Fields("n_docs");
    Pipe dPipe = new Unique("D", tokenPipe, doc_id);
    dPipe = new Each(dPipe, new Insert(tally, 1), Fields.ALL);
    dPipe = new Each(dPipe, new Insert(rhs_join, 1), Fields.ALL);
    dPipe = new SumBy(dPipe, rhs_join, tally, n_docs, long.class);
    // one branch tallies the token counts for document frequency (DF)
    Pipe dfPipe = new Unique("DF", tokenPipe, Fields.ALL);
    Fields df_count = new Fields("df_count");
    dfPipe = new CountBy(dfPipe, token, df_count);
    Fields df_token = new Fields("df_token");
    Fields lhs_join = new Fields("lhs_join");
    dfPipe = new Rename(dfPipe, token, df_token);
    dfPipe = new Each(dfPipe, new Insert(lhs_join, 1), Fields.ALL);
    // join to bring together all the components for calculating TF-IDF
    // the D side of the join is smaller, so it goes on the RHS
    Pipe idfPipe = new HashJoin(dfPipe, lhs_join, dPipe, rhs_join);
    // the IDF side of the join is smaller, so it goes on the RHS
    Pipe tfidfPipe = new CoGroup(tfPipe, tf_token, idfPipe, df_token);
    // calculate the TF-IDF weights, per token, per document
    Fields tfidf = new Fields("tfidf");
    String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
    ExpressionFunction tfidfExpression = new ExpressionFunction(tfidf, expression, Double.class);
    Fields tfidfArguments = new Fields("tf_count", "df_count", "n_docs");
    tfidfPipe = new Each(tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL);
    fieldSelector = new Fields("tf_token", "doc_id", "tfidf");
    tfidfPipe = new Retain(tfidfPipe, fieldSelector);
    tfidfPipe = new Rename(tfidfPipe, tf_token, token);
    // keep track of the word counts, which are useful for QA
    Pipe wcPipe = new Pipe("wc", tfPipe);
    Fields count = new Fields("count");
    wcPipe = new SumBy(wcPipe, tf_token, tf_count, count, long.class);
    wcPipe = new Rename(wcPipe, tf_token, token);
    // additionally, sort by count
    wcPipe = new GroupBy(wcPipe, count, count);
    // connect the taps, pipes, etc., into a flow
    FlowDef flowDef = FlowDef.flowDef().setName("tfidf").addSource(docPipe, docTap).addSource(stopPipe, stopTap).addTailSink(tfidfPipe, tfidfTap).addTailSink(wcPipe, wcTap);
    //run ambrose and cascading
    Flow tfidfFlow = flowConnector.connect(flowDef);
    EmbeddedAmbroseCascadingNotifier server = new EmbeddedAmbroseCascadingNotifier();
    tfidfFlow.addListener(server);
    tfidfFlow.addStepListener(server);
    tfidfFlow.complete();
}
Also used : Each(cascading.pipe.Each) Tap(cascading.tap.Tap) FlowDef(cascading.flow.FlowDef) Properties(java.util.Properties) HadoopFlowConnector(cascading.flow.hadoop.HadoopFlowConnector) RegexFilter(cascading.operation.regex.RegexFilter) Insert(cascading.operation.Insert) RegexSplitGenerator(cascading.operation.regex.RegexSplitGenerator) HashJoin(cascading.pipe.HashJoin) Hfs(cascading.tap.hadoop.Hfs) Retain(cascading.pipe.assembly.Retain) CoGroup(cascading.pipe.CoGroup) GroupBy(cascading.pipe.GroupBy) EmbeddedAmbroseCascadingNotifier(com.twitter.ambrose.cascading.EmbeddedAmbroseCascadingNotifier) CountBy(cascading.pipe.assembly.CountBy) ExpressionFunction(cascading.operation.expression.ExpressionFunction) SumBy(cascading.pipe.assembly.SumBy) Pipe(cascading.pipe.Pipe) Rename(cascading.pipe.assembly.Rename) Flow(cascading.flow.Flow) Fields(cascading.tuple.Fields) LeftJoin(cascading.pipe.joiner.LeftJoin) Unique(cascading.pipe.assembly.Unique) TextDelimited(cascading.scheme.hadoop.TextDelimited)

Aggregations

Flow (cascading.flow.Flow)1 FlowDef (cascading.flow.FlowDef)1 HadoopFlowConnector (cascading.flow.hadoop.HadoopFlowConnector)1 Insert (cascading.operation.Insert)1 ExpressionFunction (cascading.operation.expression.ExpressionFunction)1 RegexFilter (cascading.operation.regex.RegexFilter)1 RegexSplitGenerator (cascading.operation.regex.RegexSplitGenerator)1 CoGroup (cascading.pipe.CoGroup)1 Each (cascading.pipe.Each)1 GroupBy (cascading.pipe.GroupBy)1 HashJoin (cascading.pipe.HashJoin)1 Pipe (cascading.pipe.Pipe)1 CountBy (cascading.pipe.assembly.CountBy)1 Rename (cascading.pipe.assembly.Rename)1 Retain (cascading.pipe.assembly.Retain)1 SumBy (cascading.pipe.assembly.SumBy)1 Unique (cascading.pipe.assembly.Unique)1 LeftJoin (cascading.pipe.joiner.LeftJoin)1 TextDelimited (cascading.scheme.hadoop.TextDelimited)1 Tap (cascading.tap.Tap)1