Use of cascading.tuple.Fields in the SpyGlass project by ParallelAI.
Shown below: the HBaseScheme class, method sink.
/**
 * Writes the outgoing tuple as an HBase {@link Put}: the key field supplies
 * the row key (optionally salt-prefixed), and each configured value-field
 * group becomes cells in the corresponding column family.
 */
@Override
public void sink(FlowProcess<JobConf> flowProcess, SinkCall<Object[], OutputCollector> sinkCall) throws IOException {
TupleEntry entry = sinkCall.getOutgoingEntry();
OutputCollector collector = sinkCall.getOutput();
// Row key is the single value selected by keyField; salt it when configured.
Tuple keyTuple = entry.selectTuple(keyField);
ImmutableBytesWritable rowKey = (ImmutableBytesWritable) keyTuple.getObject(0);
if (useSalt) {
rowKey = HBaseSalter.addSaltPrefix(rowKey);
}
// A timestamp of 0L means no explicit timestamp on the Put.
Put put = (this.timeStamp == 0L) ? new Put(rowKey.get()) : new Put(rowKey.get(), this.timeStamp);
// Each entry of valueFields selects the columns belonging to familyNames[family].
for (int family = 0; family < valueFields.length; family++) {
TupleEntry selected = entry.selectEntry(valueFields[family]);
Fields selectedFields = selected.getFields();
Tuple selectedTuple = selected.getTuple();
for (int col = 0; col < selectedFields.size(); col++) {
ImmutableBytesWritable cell = (ImmutableBytesWritable) selectedTuple.getObject(col);
// Null cells are simply omitted from the Put.
if (cell != null) {
put.add(Bytes.toBytes(familyNames[family]), Bytes.toBytes((String) selectedFields.get(col)), cell.get());
}
}
}
collector.collect(null, put);
}
Use of cascading.tuple.Fields in the Impatient project by Cascading.
Shown below: the Main class, method main.
/**
 * Builds and runs the "Impatient Part 6" TF-IDF flow: asserts input shape,
 * tokenizes and scrubs documents, removes stop words via a left HashJoin,
 * computes term frequency (TF), document count (D), and document frequency
 * (DF), joins them to calculate TF-IDF weights, and also emits word counts
 * for QA. Writes a DOT diagram of the flow before executing it.
 *
 * @param args [0] docPath, [1] wcPath, [2] stopPath, [3] tfidfPath,
 *             [4] trapPath, [5] checkPath
 * @throws IllegalArgumentException if fewer than six arguments are supplied
 */
public static void main(String[] args) {
// Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException.
if (args.length < 6) {
throw new IllegalArgumentException("usage: Main <docPath> <wcPath> <stopPath> <tfidfPath> <trapPath> <checkPath>");
}
String docPath = args[0];
String wcPath = args[1];
String stopPath = args[2];
String tfidfPath = args[3];
String trapPath = args[4];
String checkPath = args[5];
Properties properties = new Properties();
AppProps.setApplicationJarClass(properties, Main.class);
AppProps.setApplicationName(properties, "Impatient Part 6");
AppProps.addApplicationTag(properties, "tutorial:impatient");
AppProps.addApplicationTag(properties, "technology:Cascading");
FlowConnector flowConnector = new Hadoop2MR1FlowConnector(properties);
// create source and sink taps
Tap docTap = new Hfs(new TextDelimited(true, "\t"), docPath);
Tap wcTap = new Hfs(new TextDelimited(true, "\t"), wcPath);
Fields stop = new Fields("stop");
Tap stopTap = new Hfs(new TextDelimited(stop, true, "\t"), stopPath);
Tap tfidfTap = new Hfs(new TextDelimited(true, "\t"), tfidfPath);
Tap trapTap = new Hfs(new TextDelimited(true, "\t"), trapPath);
Tap checkTap = new Hfs(new TextDelimited(true, "\t"), checkPath);
// use a stream assertion to validate the input data; failures go to the trap
Pipe docPipe = new Pipe("token");
AssertMatches assertMatches = new AssertMatches("doc\\d+\\s.*");
docPipe = new Each(docPipe, AssertionLevel.STRICT, assertMatches);
// specify a regex operation to split the "document" text lines into a token stream
Fields token = new Fields("token");
Fields text = new Fields("text");
RegexSplitGenerator splitter = new RegexSplitGenerator(token, "[ \\[\\]\\(\\),.]");
Fields fieldSelector = new Fields("doc_id", "token");
docPipe = new Each(docPipe, text, splitter, fieldSelector);
// define "ScrubFunction" to clean up the token stream
Fields scrubArguments = new Fields("doc_id", "token");
docPipe = new Each(docPipe, scrubArguments, new ScrubFunction(scrubArguments), Fields.RESULTS);
// perform a left join to remove stop words, discarding the rows
// which joined with stop words, i.e., were non-null after left join
Pipe stopPipe = new Pipe("stop");
Pipe tokenPipe = new HashJoin(docPipe, token, stopPipe, stop, new LeftJoin());
tokenPipe = new Each(tokenPipe, stop, new RegexFilter("^$"));
tokenPipe = new Retain(tokenPipe, fieldSelector);
// one branch of the flow tallies the token counts for term frequency (TF)
Pipe tfPipe = new Pipe("TF", tokenPipe);
Fields tf_count = new Fields("tf_count");
tfPipe = new CountBy(tfPipe, new Fields("doc_id", "token"), tf_count);
Fields tf_token = new Fields("tf_token");
tfPipe = new Rename(tfPipe, token, tf_token);
// one branch counts the number of documents (D)
Fields doc_id = new Fields("doc_id");
Fields tally = new Fields("tally");
Fields rhs_join = new Fields("rhs_join");
Fields n_docs = new Fields("n_docs");
Pipe dPipe = new Unique("D", tokenPipe, doc_id);
// constant rhs_join/tally columns let us sum all rows into one n_docs value
dPipe = new Each(dPipe, new Insert(tally, 1), Fields.ALL);
dPipe = new Each(dPipe, new Insert(rhs_join, 1), Fields.ALL);
dPipe = new SumBy(dPipe, rhs_join, tally, n_docs, long.class);
// one branch tallies the token counts for document frequency (DF)
Pipe dfPipe = new Unique("DF", tokenPipe, Fields.ALL);
Fields df_count = new Fields("df_count");
dfPipe = new CountBy(dfPipe, token, df_count);
Fields df_token = new Fields("df_token");
Fields lhs_join = new Fields("lhs_join");
dfPipe = new Rename(dfPipe, token, df_token);
dfPipe = new Each(dfPipe, new Insert(lhs_join, 1), Fields.ALL);
// example use of a debug, to observe tuple stream; turn off below
dfPipe = new Each(dfPipe, DebugLevel.VERBOSE, new Debug(true));
// join to bring together all the components for calculating TF-IDF
// the D side of the join is smaller, so it goes on the RHS
Pipe idfPipe = new HashJoin(dfPipe, lhs_join, dPipe, rhs_join);
// create a checkpoint, to observe the intermediate data in DF stream
Checkpoint idfCheck = new Checkpoint("checkpoint", idfPipe);
// the IDF side of the join is smaller, so it goes on the RHS
Pipe tfidfPipe = new CoGroup(tfPipe, tf_token, idfCheck, df_token);
// calculate the TF-IDF weights, per token, per document
Fields tfidf = new Fields("tfidf");
String expression = "(double) tf_count * Math.log( (double) n_docs / ( 1.0 + df_count ) )";
ExpressionFunction tfidfExpression = new ExpressionFunction(tfidf, expression, Double.class);
Fields tfidfArguments = new Fields("tf_count", "df_count", "n_docs");
tfidfPipe = new Each(tfidfPipe, tfidfArguments, tfidfExpression, Fields.ALL);
fieldSelector = new Fields("tf_token", "doc_id", "tfidf");
tfidfPipe = new Retain(tfidfPipe, fieldSelector);
tfidfPipe = new Rename(tfidfPipe, tf_token, token);
// keep track of the word counts, which are useful for QA
Pipe wcPipe = new Pipe("wc", tfPipe);
Fields count = new Fields("count");
wcPipe = new SumBy(wcPipe, tf_token, tf_count, count, long.class);
wcPipe = new Rename(wcPipe, tf_token, token);
// additionally, sort by count
wcPipe = new GroupBy(wcPipe, count, count);
// connect the taps, pipes, traps, checkpoints, etc., into a flow
FlowDef flowDef = FlowDef.flowDef().setName("tfidf").addSource(docPipe, docTap).addSource(stopPipe, stopTap).addTailSink(tfidfPipe, tfidfTap).addTailSink(wcPipe, wcTap).addTrap(docPipe, trapTap).addCheckpoint(idfCheck, checkTap);
// set to DebugLevel.VERBOSE for trace, or DebugLevel.NONE in production
flowDef.setDebugLevel(DebugLevel.VERBOSE);
// set to AssertionLevel.STRICT for all assertions, or AssertionLevel.NONE in production
flowDef.setAssertionLevel(AssertionLevel.STRICT);
// write a DOT file and run the flow
Flow tfidfFlow = flowConnector.connect(flowDef);
tfidfFlow.writeDOT("dot/tfidf.dot");
tfidfFlow.complete();
}
Use of cascading.tuple.Fields in the Impatient project by Cascading.
Shown below: the ScrubTest class, method testScrub.
/**
 * Verifies ScrubFunction normalizes tokens: "FoO" becomes "foo",
 * " BAR " becomes "bar", and a whitespace-only token is dropped.
 */
@Test
public void testScrub() {
Fields fieldDeclaration = new Fields("doc_id", "token");
Function scrub = new ScrubFunction(fieldDeclaration);
// Three inputs: mixed case, padded whitespace, and whitespace-only.
Tuple[] arguments = new Tuple[] {
new Tuple("doc_1", "FoO"),   // scrubbed to "foo"
new Tuple("doc_1", " BAR "), // scrubbed to "bar"
new Tuple("doc_1", " ")      // blank token: no output row expected
};
// Expected output: scrubbed tokens only, blank-token row removed.
ArrayList<Tuple> expectResults = new ArrayList<Tuple>();
expectResults.add(new Tuple("doc_1", "foo"));
expectResults.add(new Tuple("doc_1", "bar"));
TupleListCollector collector = invokeFunction(scrub, arguments, Fields.ALL);
ArrayList<Tuple> results = new ArrayList<Tuple>();
for (Iterator<Tuple> it = collector.iterator(); it.hasNext(); ) {
results.add(it.next());
}
assertEquals("Scrub result is not expected", expectResults, results);
}
Use of cascading.tuple.Fields in the SpyGlass project by ParallelAI.
Shown below: the HBaseScheme class, method source.
/**
 * Reads the next row from the HBase RecordReader into the incoming tuple.
 * The produced tuple is the row key (ImmutableBytesWritable) followed by one
 * ImmutableBytesWritable (or null for missing cells) per configured column,
 * in family/field declaration order.
 *
 * @return false when the input is exhausted; true otherwise, including rows
 *         skipped because key or value was null
 */
@Override
public boolean source(FlowProcess<JobConf> flowProcess, SourceCall<Object[], RecordReader> sourceCall) throws IOException {
Tuple result = new Tuple();
// Reusable key/value holders stashed in the source context by setup code.
Object key = sourceCall.getContext()[0];
Object value = sourceCall.getContext()[1];
boolean hasNext = sourceCall.getInput().next(key, value);
if (!hasNext) {
return false;
}
// Skip nulls
// NOTE(review): this returns true WITHOUT setting the incoming entry's
// tuple, so downstream would still see the previous tuple — verify this
// branch is ever taken and that the caller tolerates it.
if (key == null || value == null) {
return true;
}
ImmutableBytesWritable keyWritable = (ImmutableBytesWritable) key;
Result row = (Result) value;
// First tuple position is always the row key.
result.add(keyWritable);
// Append one value per configured family/field pair, preserving order.
for (int i = 0; i < this.familyNames.length; i++) {
String familyName = this.familyNames[i];
byte[] familyNameBytes = Bytes.toBytes(familyName);
Fields fields = this.valueFields[i];
for (int k = 0; k < fields.size(); k++) {
String fieldName = (String) fields.get(k);
byte[] fieldNameBytes = Bytes.toBytes(fieldName);
byte[] cellValue = row.getValue(familyNameBytes, fieldNameBytes);
// Missing cells become explicit nulls so tuple arity stays fixed.
result.add(cellValue != null ? new ImmutableBytesWritable(cellValue) : null);
}
}
sourceCall.getIncomingEntry().setTuple(result);
return true;
}
Use of cascading.tuple.Fields in the SpyGlass project by ParallelAI.
Shown below: the HBaseScheme class, method columns.
/**
 * Lazily builds and caches the flattened list of column names: one entry per
 * field across all Fields groups, optionally prefixed with its column family.
 * Subsequent calls return the cached array regardless of arguments.
 */
private String[] columns(String[] familyNames, Fields[] fieldsArray) {
if (columns != null) {
return columns;
}
// Size the cache to the total field count across all groups.
int total = 0;
for (Fields group : fieldsArray) {
total += group.size();
}
columns = new String[total];
int next = 0;
for (int i = 0; i < fieldsArray.length; i++) {
Fields group = fieldsArray[i];
for (int j = 0; j < group.size(); j++) {
String fieldName = (String) group.get(j);
// Without family names the field alone names the column; otherwise
// the family prefix is applied first.
columns[next++] = (familyNames == null) ? hbaseColumn(fieldName) : hbaseColumn(familyNames[i]) + fieldName;
}
}
return columns;
}
Aggregations