Use of org.apache.spark.sql.DataFrame in project camel by apache.
The class SparkProducerTest, method createRegistry:
// Routes fixtures
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = super.createRegistry();
    registry.bind("testFileRdd", sparkContext.textFile("src/test/resources/testrdd.txt"));
    if (shouldRunHive) {
        registry.bind("hiveContext", hiveContext);
        DataFrame jsonCars = hiveContext.read().json("src/test/resources/cars.json");
        jsonCars.registerTempTable("cars");
        registry.bind("jsonCars", jsonCars);
    }
    registry.bind("countLinesTransformation", new org.apache.camel.component.spark.RddCallback() {
        @Override
        public Object onRdd(JavaRDDLike rdd, Object... payloads) {
            return rdd.count();
        }
    });
    return registry;
}
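A minimal sketch (not part of the test) of how the beans bound above could be referenced from a Camel route, assuming the camel-spark endpoint resolves the rdd, rddCallback, dataFrame and dataFrameCallback options via "#" registry lookups; the carsCallback bean name is hypothetical.

import org.apache.camel.builder.RouteBuilder;

public class SparkTestRoutes extends RouteBuilder {
    @Override
    public void configure() throws Exception {
        // Count the lines of the text-file RDD bound as "testFileRdd".
        from("direct:countLines")
            .to("spark:rdd?rdd=#testFileRdd&rddCallback=#countLinesTransformation");
        // Apply a callback to the "jsonCars" DataFrame (callback bean name is hypothetical).
        from("direct:cars")
            .to("spark:dataframe?dataFrame=#jsonCars&dataFrameCallback=#carsCallback");
    }
}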
Use of org.apache.spark.sql.DataFrame in project camel by apache.
The class DataFrameSparkProducer, method process:
@Override
public void process(Exchange exchange) throws Exception {
    // Resolve the target DataFrame and the callback to apply to it.
    DataFrame dataFrame = resolveDataFrame(exchange);
    DataFrameCallback dataFrameCallback = resolveDataFrameCallback(exchange);
    // A List body is unpacked into varargs payloads; any other body is passed as a single payload.
    Object body = exchange.getIn().getBody();
    Object result = body instanceof List
            ? dataFrameCallback.onDataFrame(dataFrame, ((List) body).toArray(new Object[0]))
            : dataFrameCallback.onDataFrame(dataFrame, body);
    collectResults(exchange, result);
}
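For reference, a minimal sketch of a callback this producer could resolve, assuming DataFrameCallback exposes a single onDataFrame(DataFrame, Object...) method analogous to the RddCallback used in the test registry above:

import org.apache.camel.component.spark.DataFrameCallback;
import org.apache.spark.sql.DataFrame;

public class CountCarsCallback implements DataFrameCallback<Long> {
    @Override
    public Long onDataFrame(DataFrame dataFrame, Object... payloads) {
        // Ignore the exchange payloads and return the number of rows.
        return dataFrame.count();
    }
}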
Use of org.apache.spark.sql.DataFrame in project camel by apache.
The class HiveSparkProducer, method process:
@Override
public void process(Exchange exchange) throws Exception {
    // Run the SQL from the message body against the Hive context.
    HiveContext hiveContext = resolveHiveContext();
    String sql = exchange.getIn().getBody(String.class);
    DataFrame resultFrame = hiveContext.sql(sql);
    // Reply with either the collected rows or just the row count, depending on the endpoint's collect flag.
    exchange.getIn().setBody(getEndpoint().isCollect() ? resultFrame.collectAsList() : resultFrame.count());
}
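A hedged usage sketch (not from the project): invoking this producer through a ProducerTemplate, assuming a HiveContext is bound in the registry as in the test above, and assuming the endpoint exposes a collect option that switches the reply between collected rows and the row count.

import org.apache.camel.CamelContext;
import org.apache.camel.ProducerTemplate;
import org.apache.camel.impl.DefaultCamelContext;

public class HiveQuerySketch {
    public static void main(String[] args) throws Exception {
        CamelContext context = new DefaultCamelContext();
        context.start();
        ProducerTemplate template = context.createProducerTemplate();
        // Send the SQL as the message body; with collect=false the reply is the row count.
        Long carCount = template.requestBody("spark:hive?collect=false", "SELECT * FROM cars", Long.class);
        System.out.println("cars: " + carCount);
        context.stop();
    }
}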
Use of org.apache.spark.sql.DataFrame in project geode by apache.
The class OQLJavaDemo, method main:
public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: OQLJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("OQLJavaDemo");
    // "192.168.1.47[10335]"
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame df = javaFunctions(sqlContext).geodeOQL("select * from /str_str_region");
    System.out.println("======= DataFrame =======\n");
    df.show();
    sc.stop();
}
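A short follow-on sketch (not part of the demo): the OQL result is an ordinary Spark DataFrame, so it can be registered as a temporary table and queried with Spark SQL. The temp-table name str_str is an arbitrary choice here.

import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.SQLContext;

public class OQLQuerySketch {
    // Register the Geode-backed DataFrame as a temp table and sample it with Spark SQL.
    static void sample(SQLContext sqlContext, DataFrame df) {
        df.registerTempTable("str_str");
        DataFrame firstRows = sqlContext.sql("SELECT * FROM str_str LIMIT 5");
        firstRows.show();
    }
}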
Use of org.apache.spark.sql.DataFrame in project learning-spark by databricks.
The class SparkSQLTwitter, method main:
public static void main(String[] args) {
    String inputFile = args[0];
    SparkConf conf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlCtx = new SQLContext(sc);
    DataFrame input = sqlCtx.jsonFile(inputFile);
    // Print the schema
    input.printSchema();
    // Register the input schema RDD
    input.registerTempTable("tweets");
    // Select tweets based on the retweetCount
    DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
    Row[] result = topTweets.collect();
    for (Row row : result) {
        System.out.println(row.get(0));
    }
    JavaRDD<String> topTweetText = topTweets.toJavaRDD().map(new Function<Row, String>() {
        public String call(Row row) {
            return row.getString(0);
        }
    });
    System.out.println(topTweetText.collect());
    // Create a person and turn it into a Schema RDD
    ArrayList<HappyPerson> peopleList = new ArrayList<HappyPerson>();
    peopleList.add(new HappyPerson("holden", "coffee"));
    JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
    DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
    happyPeopleSchemaRDD.registerTempTable("happy_people");
    sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {
        @Override
        public Integer call(String str) throws Exception {
            return str.length();
        }
    }, DataTypes.IntegerType);
    DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava('text') FROM tweets LIMIT 10");
    Row[] lengths = tweetLength.collect();
    for (Row row : lengths) {
        System.out.println(row.get(0));
    }
    sc.stop();
}
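The demo relies on a HappyPerson JavaBean: applySchema infers the DataFrame schema from the bean's getters. A minimal sketch of such a bean is shown below; the field names name and favouriteBeverage are assumptions based on the two-argument constructor call in the snippet.

import java.io.Serializable;

public class HappyPerson implements Serializable {
    // Hypothetical field names; the snippet only shows new HappyPerson("holden", "coffee").
    private String name;
    private String favouriteBeverage;

    public HappyPerson() {
    }

    public HappyPerson(String name, String favouriteBeverage) {
        this.name = name;
        this.favouriteBeverage = favouriteBeverage;
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public String getFavouriteBeverage() { return favouriteBeverage; }
    public void setFavouriteBeverage(String favouriteBeverage) { this.favouriteBeverage = favouriteBeverage; }
}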