Use of org.apache.spark.sql.DataFrame in project learning-spark by databricks.
The class LoadHive, method main.
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage LoadHive sparkMaster tbl");
  }
  String master = args[0];
  String tbl = args[1];
  JavaSparkContext sc = new JavaSparkContext(
      master, "loadhive", System.getenv("SPARK_HOME"), System.getenv("JARS"));
  // Querying the Hive table "src" assumes the SQLContext has Hive support
  // (a HiveContext in Spark 1.x).
  SQLContext sqlCtx = new SQLContext(sc);
  DataFrame rdd = sqlCtx.sql("SELECT key, value FROM src");
  // Convert the DataFrame to a JavaRDD<Row> and square each key.
  JavaRDD<Integer> squaredKeys = rdd.toJavaRDD().map(new SquareKey());
  List<Integer> result = squaredKeys.collect();
  for (Integer elem : result) {
    System.out.println(elem);
  }
}
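The SquareKey function used above is not shown in this snippet. A minimal sketch, assuming it squares the integer "key" column of each Row; it relies on org.apache.spark.api.java.function.Function and org.apache.spark.sql.Row, and the actual class in the learning-spark repository may differ.

public static class SquareKey implements Function<Row, Integer> {
  public Integer call(Row row) throws Exception {
    // Column 0 is the "key" column selected above.
    return row.getInt(0) * row.getInt(0);
  }
}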
Use of org.apache.spark.sql.DataFrame in project learning-spark by databricks.
The class LoadJsonWithSparkSQL, method main.
public static void main(String[] args) throws Exception {
  if (args.length != 2) {
    throw new Exception("Usage LoadJsonWithSparkSQL sparkMaster jsonFile");
  }
  String master = args[0];
  String jsonFile = args[1];
  JavaSparkContext sc = new JavaSparkContext(master, "loadJsonwithsparksql");
  SQLContext sqlCtx = new SQLContext(sc);
  // Load the JSON file as a DataFrame and print the schema Spark SQL inferred.
  DataFrame input = sqlCtx.jsonFile(jsonFile);
  input.printSchema();
}
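A natural next step, not part of the original example, is to register the loaded DataFrame as a temporary table and query it with SQL. The table name "records" and column name "name" below are assumptions; substitute fields that actually appear in your JSON file.

input.registerTempTable("records");
// Any column reported by printSchema() can be selected here.
DataFrame names = sqlCtx.sql("SELECT name FROM records");
names.show();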
Use of org.apache.spark.sql.DataFrame in project mongo-hadoop by mongodb.
The class DataframeExample, method run.
public void run() {
  JavaSparkContext sc = new JavaSparkContext(new SparkConf());
  // Set configuration options for the MongoDB Hadoop Connector.
  Configuration mongodbConfig = new Configuration();
  // MongoInputFormat allows us to read from a live MongoDB instance.
  // We could also use BSONFileInputFormat to read BSON snapshots.
  mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
  // MongoDB connection string naming a collection to use.
  // If using BSON, use "mapred.input.dir" to configure the directory
  // where BSON files are located instead.
  mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages");
  // Create an RDD backed by the MongoDB collection.
  JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
      mongodbConfig,          // Configuration
      MongoInputFormat.class, // InputFormat: read from a live cluster.
      Object.class,           // Key class
      BSONObject.class        // Value class
  );
  // Convert each (key, document) pair into a Message bean.
  JavaRDD<Message> messages = documents.map(new Function<Tuple2<Object, BSONObject>, Message>() {
    public Message call(final Tuple2<Object, BSONObject> tuple) {
      Message m = new Message();
      BSONObject header = (BSONObject) tuple._2().get("headers");
      m.setTo((String) header.get("To"));
      m.setxFrom((String) header.get("From"));
      m.setMessageID((String) header.get("Message-ID"));
      m.setBody((String) tuple._2().get("body"));
      return m;
    }
  });
  SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
  // Infer a schema from the Message JavaBean and register the result as a table.
  DataFrame messagesSchema = sqlContext.createDataFrame(messages, Message.class);
  messagesSchema.registerTempTable("messages");
  DataFrame ericsMessages = sqlContext.sql("SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\"");
  ericsMessages.show();
  messagesSchema.printSchema();
}
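createDataFrame(messages, Message.class) derives the DataFrame schema from a JavaBean, but the Message class itself is not shown in this snippet. A minimal sketch consistent with the setters used above; the real class in the mongo-hadoop examples may carry additional fields.

import java.io.Serializable;

public class Message implements Serializable {
  private String to;
  private String xFrom;
  private String messageID;
  private String body;

  // Spark SQL reads these getter/setter pairs to build the schema.
  public String getTo() { return to; }
  public void setTo(String to) { this.to = to; }

  public String getxFrom() { return xFrom; }
  public void setxFrom(String xFrom) { this.xFrom = xFrom; }

  public String getMessageID() { return messageID; }
  public void setMessageID(String messageID) { this.messageID = messageID; }

  public String getBody() { return body; }
  public void setBody(String body) { this.body = body; }
}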