use of org.bson.BSONObject in project mongo-hadoop by mongodb.
the class BSONLoader method next.
/**
 * Reads and decodes the next BSON document from the input stream.
 *
 * <p>The four header bytes held in {@code nextHdr} are copied to the front of
 * the buffer, the remaining {@code nextLen - 4} bytes are read from
 * {@code input}, and the buffer is decoded through {@code decoder} into
 * {@code callback}.
 *
 * <p>If the read fails with an {@link IOException}, {@code hasMore} is set to
 * {@code false}, the input is closed (a failure to close is only logged), and
 * iteration ends.
 *
 * @return the decoded document obtained from {@code callback}
 * @throws NoSuchElementException if the document could not be read; the
 *         triggering {@link IOException} is attached as the cause
 */
public BSONObject next() {
    try {
        // NOTE(review): the buffer is nextLen + 4 bytes, but only nextLen bytes
        // are filled (4 header bytes + nextLen - 4 body bytes). Presumably
        // nextLen already includes the header -- confirm before resizing.
        byte[] data = new byte[nextLen + 4];
        System.arraycopy(nextHdr, 0, data, 0, 4);
        input.readFully(data, 4, nextLen - 4);
        decoder.decode(data, callback);
        return (BSONObject) callback.get();
    } catch (IOException e) {
        /* If we can't read another length it's not an error, just return quietly. */
        LOG.info("No Length Header available." + e);
        hasMore.set(false);
        try {
            input.close();
        } catch (IOException e1) {
            LOG.warn(e1.getMessage(), e1);
        }
        // Keep the I/O failure reachable from the exception so callers can tell
        // a truncated or failed read apart from a clean end of input.
        NoSuchElementException completed = new NoSuchElementException("Iteration completed.");
        completed.initCause(e);
        throw completed;
    }
}
use of org.bson.BSONObject in project mongo-hadoop by mongodb.
the class Enron method run.
/**
 * Counts how many messages each sender sent to each recipient and stores the
 * totals in MongoDB.
 *
 * <p>Messages are read from the {@code enron_mail.messages} collection. Every
 * message with a non-empty "To" header yields one {@code from|to} string per
 * comma-separated, non-blank recipient. Equal strings are summed with
 * {@code reduceByKey} and the resulting counts are written to the
 * {@code enron_mail.message_pairs} collection.
 *
 * <p>The Spark context created here is stopped before the method returns,
 * whether the job succeeds or fails.
 */
public void run() {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf());
    try {
        // Set configuration options for the MongoDB Hadoop Connector.
        Configuration mongodbConfig = new Configuration();
        // MongoInputFormat allows us to read from a live MongoDB instance.
        // We could also use BSONFileInputFormat to read BSON snapshots.
        mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
        // MongoDB connection string naming a collection to use.
        // If using BSON, use "mapred.input.dir" to configure the directory
        // where BSON files are located instead.
        mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages");

        // Create an RDD backed by the MongoDB collection.
        JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
            mongodbConfig,           // Configuration
            MongoInputFormat.class,  // InputFormat: read from a live cluster.
            Object.class,            // Key class
            BSONObject.class);       // Value class

        JavaRDD<String> edges = documents.flatMap(new FlatMapFunction<Tuple2<Object, BSONObject>, String>() {
            @Override
            public Iterable<String> call(final Tuple2<Object, BSONObject> t) throws Exception {
                BSONObject header = (BSONObject) t._2().get("headers");
                String to = (String) header.get("To");
                String from = (String) header.get("From");
                // Each entry is an individual "from|to" pair.
                List<String> tuples = new ArrayList<String>();
                if (to != null && !to.isEmpty()) {
                    for (String recipient : to.split(",")) {
                        String s = recipient.trim();
                        if (s.length() > 0) {
                            tuples.add(from + "|" + s);
                        }
                    }
                }
                return tuples;
            }
        });

        // Pair every edge with a count of one so equal edges can be summed.
        JavaPairRDD<String, Integer> pairs = edges.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(final String s) {
                return new Tuple2<String, Integer>(s, 1);
            }
        });

        JavaPairRDD<String, Integer> counts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(final Integer a, final Integer b) {
                return a + b;
            }
        });

        // Create a separate Configuration for saving data back to MongoDB.
        Configuration outputConfig = new Configuration();
        outputConfig.set("mongo.output.uri", "mongodb://localhost:27017/enron_mail.message_pairs");

        // Save this RDD as a Hadoop "file".
        // The path argument is unused; all documents will go to 'mongo.output.uri'.
        counts.saveAsNewAPIHadoopFile("file:///this-is-completely-unused", Object.class, BSONObject.class, MongoOutputFormat.class, outputConfig);
    } finally {
        // The context is created by this method, so it is released here; leaving
        // it running would keep its resources alive after the job is done.
        sc.stop();
    }
}
use of org.bson.BSONObject in project mongo-hadoop by mongodb.
the class EnronMailMapper method map.
/**
 * Emits one (sender, recipient) pair for each recipient named in a message's
 * "To" header.
 *
 * <p>The "To" header is split on commas; each non-blank, trimmed entry is
 * written to the context as a {@code MailPair} key (reusing {@code mp}) with
 * {@code intw} as the value. Messages without a "To" header produce no output.
 *
 * @param key     the input key; not used
 * @param val     the message document, read through its "headers" sub-document
 * @param context the context that receives the emitted pairs
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
public void map(final Object key, final BSONObject val, final Context context) throws IOException, InterruptedException {
    BSONObject headers = (BSONObject) val.get("headers");
    String to = (String) headers.get("To");
    // The sender is the "From" header. The input key is an opaque Object, so
    // casting it to String is not safe and does not name the sender.
    String from = (String) headers.get("From");
    if (null != to) {
        String[] recipients = to.split(",");
        for (final String recip1 : recipients) {
            String recip = recip1.trim();
            if (recip.length() > 0) {
                mp.setFrom(from);
                mp.setTo(recip);
                context.write(mp, intw);
            }
        }
    }
}
use of org.bson.BSONObject in project mongo-hadoop by mongodb.
the class EnronMailMapper method map.
/**
 * Emits one (sender, recipient) pair for each recipient named in a message's
 * "To" header.
 *
 * <p>The sender is the "From" header. The "To" header is split on commas and
 * each non-blank, trimmed entry is collected as a {@code MailPair} key
 * (reusing {@code mp}) with {@code intw} as the value. Messages without a
 * "To" header produce no output.
 *
 * @param key      the input key; not used
 * @param writable wrapper whose document supplies the "headers" sub-document
 * @param output   the collector that receives the emitted pairs
 * @param reporter not used
 * @throws IOException if collecting a pair fails
 */
@Override
public void map(final Object key, final BSONWritable writable, final OutputCollector<MailPair, IntWritable> output, final Reporter reporter) throws IOException {
    final BSONObject mailHeaders = (BSONObject) writable.getDoc().get("headers");
    final String recipientList = (String) mailHeaders.get("To");
    final String sender = (String) mailHeaders.get("From");

    // Nothing to emit when the message names no recipients.
    if (recipientList == null) {
        return;
    }

    for (final String rawAddress : recipientList.split(",")) {
        final String address = rawAddress.trim();
        if (address.isEmpty()) {
            // Skip blank entries such as those left by stray commas.
            continue;
        }
        mp.setFrom(sender);
        mp.setTo(address);
        output.collect(mp, intw);
    }
}
use of org.bson.BSONObject in project mongo-hadoop by mongodb.
the class EnronMailReducer method reduce.
/**
 * Sums the counts for one sender/recipient pair and writes the total.
 *
 * <p>The output key is {@code reduceResult}, set to a document with field
 * "f" (the pair's sender) and field "t" (the pair's recipient). The output
 * value is {@code intw}, set to the sum of all incoming values.
 *
 * @param pKey     the sender/recipient pair being reduced
 * @param pValues  the counts emitted for this pair
 * @param pContext the context that receives the result
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if writing to the context is interrupted
 */
@Override
public void reduce(final MailPair pKey, final Iterable<IntWritable> pValues, final Context pContext) throws IOException, InterruptedException {
    int total = 0;
    for (final IntWritable count : pValues) {
        total = total + count.get();
    }

    // Build the { f: <from>, t: <to> } document that identifies the pair.
    final BasicDBObjectBuilder withSender = BasicDBObjectBuilder.start().add("f", pKey.getFrom());
    final BSONObject pairDoc = withSender.add("t", pKey.getTo()).get();

    reduceResult.setDoc(pairDoc);
    intw.set(total);
    pContext.write(reduceResult, intw);
}
Aggregations