Use of cascading.tuple.Fields in the SpyGlass project by ParallelAI.
Shown below: the HBaseScheme class, method sink.
/**
 * Writes the outgoing tuple as an HBase {@link Put}: the key field supplies
 * the row key (optionally salt-prefixed), and each configured value-field
 * group becomes cells in the corresponding column family.
 */
@Override
public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
TupleEntry entry = sinkCall.getOutgoingEntry();
OutputCollector collector = sinkCall.getOutput();
// Row key is the single value selected by keyField; salt it when configured.
Tuple keyTuple = entry.selectTuple(keyField);
ImmutableBytesWritable rowKey = (ImmutableBytesWritable) keyTuple.getObject(0);
if (useSalt) {
rowKey = HBaseSalter.addSaltPrefix(rowKey);
}
// A timestamp of 0L means no explicit timestamp on the Put.
Put put = (this.timeStamp == 0L) ? new Put(rowKey.get()) : new Put(rowKey.get(), this.timeStamp);
// Each entry of valueFields selects the columns belonging to familyNames[family].
for (int family = 0; family < valueFields.length; family++) {
TupleEntry selected = entry.selectEntry(valueFields[family]);
Fields selectedFields = selected.getFields();
Tuple selectedTuple = selected.getTuple();
for (int col = 0; col < selectedFields.size(); col++) {
ImmutableBytesWritable cell = (ImmutableBytesWritable) selectedTuple.getObject(col);
// Null cells are simply omitted from the Put.
if (cell != null) {
put.add(Bytes.toBytes(familyNames[family]), Bytes.toBytes((String) selectedFields.get(col)), cell.get());
}
}
}
collector.collect(null, put);
}
Use of cascading.tuple.Fields in the Impatient project by Cascading.
Shown below: the Main class, method main.
/**
 * Builds and runs the "Impatient Part 6" TF-IDF flow: asserts input shape,
 * tokenizes and scrubs documents, removes stop words via a left HashJoin,
 * computes term frequency (TF), document count (D), and document frequency
 * (DF), joins them to calculate TF-IDF weights, and also emits word counts
 * for QA. Writes a DOT diagram of the flow before executing it.
 *
 * @param args [0] docPath, [1] wcPath, [2] stopPath, [3] tfidfPath,
 *             [4] trapPath, [5] checkPath
 * @throws IllegalArgumentException if fewer than six arguments are supplied
 */
public static void main(String[] args) {
// Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
if (args.length < 6) {
throw new IllegalArgumentException("usage: Main <docPath> <wcPath> <stopPath> <tfidfPath> <trapPath> <checkPath>");
}
String docPath = args[0];
String wcPath = args[1];
String stopPath = args[2];
String tfidfPath = args[3];
String trapPath = args[4];
String checkPath = args[5];
Properties properties = new Properties();
AppProps.setApplicationJarClass(properties, Main.class);
AppProps.setApplicationName(properties, "Impatient Part 6");
AppProps.addApplicationTag(properties, "tutorial:impatient");
AppProps.addApplicationTag(properties, "technology:Cascading");
FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);
// create source and sink taps
Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);
Fields stop = new Fields("stop");
Tap stopTap = new Hfs(new TextDelimited(stop, true, "\t"), stopPath);
Tap tfidfTap = new Hfs(new TextDelimited(true, "\t"), tfidfPath);
Tap trapTap = new Hfs(new TextDelimited(true, "\t"), trapPath);
Tap checkTap = new Hfs(new TextDelimited(true, "\t"), checkPath);
// use a stream assertion to validate the input data; failures go to the trap
Pipe docPipe = new Pipe("token");
AssertMatches assertMatches = new AssertMatches("doc\\d+\\s.*");
docPipe = new Each(docPipe, AssertionLevel.STRICT, assertMatches);
// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields("token");
Fields text = new Fields("text");
RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
Fields fieldSelector = new Fields("doc_id", "token");
docPipe = new Each(docPipe, text, splitter, fieldSelector);
// define "ScrubFunction" to clean up the token stream
Fields scrubArguments = new Fields("doc_id", "token");
docPipe = new Each(docPipe, scrubArguments, new ScrubFunction(scrubArguments), Fields.RESULTS);
// perform a left join to remove stop words, discarding the rows
// which joined with stop words, i.e., were non-null after left join
Pipe stopPipe = new Pipe("stop");
Pipe tokenPipe = new HashJoin(docPipe, token, stopPipe, stop, new LeftJoin());
tokenPipe = new Each(tokenPipe, stop, new RegexFilter("^$"));
tokenPipe = new Retain(tokenPipe, fieldSelector);
// one branch of the flow tallies the token counts for term frequency (TF)
Pipe tfPipe = new Pipe("TF", tokenPipe);
Fields tf_count = new Fields("tf_count");
tfPipe = new CountBy(tfPipe, new Fields("doc_id", "token"), tf_count);
Fields tf_token = new Fields("tf_token");
tfPipe = new Rename(tfPipe, token, tf_token);
// one branch counts the number of documents (D)
Fields doc_id = new Fields("doc_id");
Fields tally = new Fields("tally");
Fields rhs_join = new Fields("rhs_join");
Fields n_docs = new Fields("n_docs");
Pipe dPipe = new Unique("D", tokenPipe, doc_id);
// constant rhs_join/tally columns let us sum all rows into one n_docs value
dPipe = new Each(dPipe, new Insert(tally, 1), Fields.ALL);
dPipe = new Each(dPipe, new Insert(rhs_join, 1), Fields.ALL);
dPipe = new SumBy(dPipe, rhs_join, tally, n_docs, long.class);
// one branch tallies the token counts for document frequency (DF)
Pipe dfPipe = new Unique("DF", tokenPipe, Fields.ALL);
Fields df_count = new Fields("df_count");
dfPipe = new CountBy(dfPipe, token, df_count);
Fields df_token = new Fields("df_token");
Fields lhs_join = new Fields("lhs_join");
dfPipe = new Rename(dfPipe, token, df_token);
dfPipe = new Each(dfPipe, new Insert(lhs_join, 1), Fields.ALL);
// example use of a debug, to observe tuple stream; turn off below
dfPipe = new Each(dfPipe, DebugLevel.VERBOSE, new Debug(true));
// join to bring together all the components for calculating TF-IDF
// the D side of the join is smaller, so it goes on the RHS
Pipe idfPipe = new HashJoin(dfPipe, lhs_join, dPipe, rhs_join);
// create a checkpoint, to observe the intermediate data in DF stream
Checkpoint idfCheck = new Checkpoint("checkpoint", idfPipe);
// the IDF side of the join is smaller, so it goes on the RHS
Pipe tfidfPipe = new CoGroup(tfPipe, tf_token, idfCheck, df_token);
// calculate the TF-IDF weights, per token, per document
Fields tfidf = new Fields("tfidf");
String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
ExpressionFunction tfidfExpression = new ExpressionFunction(tfidf, expression, Double.class);
Fields tfidfArguments = new Fields("tf_count", "df_count", "n_docs");
tfidfPipe = new Each(tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL);
fieldSelector = new Fields("tf_token", "doc_id", "tfidf");
tfidfPipe = new Retain(tfidfPipe, fieldSelector);
tfidfPipe = new Rename(tfidfPipe, tf_token, token);
// keep track of the word counts, which are useful for QA
Pipe wcPipe = new Pipe("wc", tfPipe);
Fields count = new Fields("count");
wcPipe = new SumBy(wcPipe, tf_token, tf_count, count, long.class);
wcPipe = new Rename(wcPipe, tf_token, token);
// additionally, sort by count
wcPipe = new GroupBy(wcPipe, count, count);
// connect the taps, pipes, traps, checkpoints, etc., into a flow
FlowDef flowDef = FlowDef.flowDef().setName("tfidf").addSource(docPipe, docTap).addSource(stopPipe, stopTap).addTailSink(tfidfPipe, tfidfTap).addTailSink(wcPipe, wcTap).addTrap(docPipe, trapTap).addCheckpoint(idfCheck, checkTap);
// set to DebugLevel.VERBOSE for trace, or DebugLevel.NONE in production
flowDef.setDebugLevel(DebugLevel.VERBOSE);
// set to AssertionLevel.STRICT for all assertions, or AssertionLevel.NONE in production
flowDef.setAssertionLevel(AssertionLevel.STRICT);
// write a DOT file and run the flow
Flow tfidfFlow = flowConnector.connect(flowDef);
tfidfFlow.writeDOT("dot/tfidf.dot");
tfidfFlow.complete();
}
Use of cascading.tuple.Fields in the Impatient project by Cascading.
Shown below: the ScrubTest class, method testScrub.
/**
 * Verifies ScrubFunction normalizes tokens: "FoO" becomes "foo",
 * " BAR " becomes "bar", and a whitespace-only token is dropped.
 */
@Test
public void testScrub() {
Fields fieldDeclaration = new Fields("doc_id", "token");
Function scrub = new ScrubFunction(fieldDeclaration);
// Three inputs: mixed case, padded whitespace, and whitespace-only.
Tuple[] arguments = new Tuple[] {
new Tuple("doc_1", "FoO"),   // scrubbed to "foo"
new Tuple("doc_1", " BAR "), // scrubbed to "bar"
new Tuple("doc_1", " ")      // blank token: no output row expected
};
// Expected output: scrubbed tokens only, blank-token row removed.
ArrayList<Tuple> expectResults = new ArrayList<Tuple>();
expectResults.add(new Tuple("doc_1", "foo"));
expectResults.add(new Tuple("doc_1", "bar"));
TupleListCollector collector = invokeFunction(scrub, arguments, Fields.ALL);
ArrayList<Tuple> results = new ArrayList<Tuple>();
for (Iterator<Tuple> it = collector.iterator(); it.hasNext(); ) {
results.add(it.next());
}
assertEquals("Scrub result is not expected", expectResults, results);
}
Use of cascading.tuple.Fields in the SpyGlass project by ParallelAI.
Shown below: the HBaseScheme class, method source.
/**
 * Reads the next row from the HBase RecordReader into the incoming tuple.
 * The produced tuple is the row key (ImmutableBytesWritable) followed by one
 * ImmutableBytesWritable (or null for missing cells) per configured column,
 * in family/field declaration order.
 *
 * @return false when the input is exhausted; true otherwise, including rows
 *         skipped because key or value was null
 */
@Override
public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) throws IOException {
Tuple result = new Tuple();
// Reusable key/value holders stashed in the source context by setup code.
Object key = sourceCall.getContext()[0];
Object value = sourceCall.getContext()[1];
boolean hasNext = sourceCall.getInput().next(key, value);
if (!hasNext) {
return false;
}
// Skip nulls
// NOTE(review): this returns true WITHOUT setting the incoming entry's
// tuple, so downstream would still see the previous tuple — verify this
// branch is ever taken and that the caller tolerates it.
if (key == null || value == null) {
return true;
}
ImmutableBytesWritable keyWritable = (ImmutableBytesWritable) key;
Result row = (Result) value;
// First tuple position is always the row key.
result.add(keyWritable);
// Append one value per configured family/field pair, preserving order.
for (int i = 0; i < this.familyNames.length; i++) {
String familyName = this.familyNames[i];
byte[] familyNameBytes = Bytes.toBytes(familyName);
Fields fields = this.valueFields[i];
for (int k = 0; k < fields.size(); k++) {
String fieldName = (String) fields.get(k);
byte[] fieldNameBytes = Bytes.toBytes(fieldName);
byte[] cellValue = row.getValue(familyNameBytes, fieldNameBytes);
// Missing cells become explicit nulls so tuple arity stays fixed.
result.add(cellValue != null ? new ImmutableBytesWritable(cellValue) : null);
}
}
sourceCall.getIncomingEntry().setTuple(result);
return true;
}
Use of cascading.tuple.Fields in the SpyGlass project by ParallelAI.
Shown below: the HBaseScheme class, method columns.
/**
 * Lazily builds and caches the flattened list of column names: one entry per
 * field across all Fields groups, optionally prefixed with its column family.
 * Subsequent calls return the cached array regardless of arguments.
 */
private String[] columns(String[] familyNames, Fields[] fieldsArray) {
if (columns != null) {
return columns;
}
// Size the cache to the total field count across all groups.
int total = 0;
for (Fields group : fieldsArray) {
total += group.size();
}
columns = new String[total];
int next = 0;
for (int i = 0; i < fieldsArray.length; i++) {
Fields group = fieldsArray[i];
for (int j = 0; j < group.size(); j++) {
String fieldName = (String) group.get(j);
// Without family names the field alone names the column; otherwise
// the family prefix is applied first.
columns[next++] = (familyNames == null) ? hbaseColumn(fieldName) : hbaseColumn(familyNames[i]) + fieldName;
}
}
return columns;
}
Aggregations