
Example 1 with DataFrame

Use of org.apache.spark.sql.DataFrame in project camel by apache.

From the class SparkProducerTest, method createRegistry.

// Routes fixtures
@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = super.createRegistry();
    registry.bind("testFileRdd", sparkContext.textFile("src/test/resources/testrdd.txt"));
    if (shouldRunHive) {
        registry.bind("hiveContext", hiveContext);
        DataFrame jsonCars = hiveContext.read().json("src/test/resources/cars.json");
        jsonCars.registerTempTable("cars");
        registry.bind("jsonCars", jsonCars);
    }
    registry.bind("countLinesTransformation", new org.apache.camel.component.spark.RddCallback() {

        @Override
        public Object onRdd(JavaRDDLike rdd, Object... payloads) {
            return rdd.count();
        }
    });
    return registry;
}
Also used: JndiRegistry (org.apache.camel.impl.JndiRegistry), JavaRDDLike (org.apache.spark.api.java.JavaRDDLike), DataFrame (org.apache.spark.sql.DataFrame)
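
For context, the routes in this test refer to these registry beans by name. Below is a minimal sketch of such a route, assuming the standard camel-spark spark:rdd endpoint options; the class name and the direct: endpoint name are illustrative, not taken from the test.

import org.apache.camel.builder.RouteBuilder;

public class CountLinesRoute extends RouteBuilder {
    @Override
    public void configure() {
        // Looks up the RDD and the callback bound in createRegistry() above.
        from("direct:countLines")
            .to("spark:rdd?rdd=#testFileRdd&rddCallback=#countLinesTransformation");
    }
}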

Example 2 with DataFrame

Use of org.apache.spark.sql.DataFrame in project camel by apache.

From the class DataFrameSparkProducer, method process.

@Override
public void process(Exchange exchange) throws Exception {
    DataFrame dataFrame = resolveDataFrame(exchange);
    DataFrameCallback dataFrameCallback = resolveDataFrameCallback(exchange);
    Object body = exchange.getIn().getBody();
    Object result = body instanceof List
            ? dataFrameCallback.onDataFrame(dataFrame, ((List) body).toArray(new Object[0]))
            : dataFrameCallback.onDataFrame(dataFrame, body);
    collectResults(exchange, result);
}
Also used: List (java.util.List), DataFrame (org.apache.spark.sql.DataFrame)
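
A callback of the kind this producer resolves could be implemented as in the sketch below; the class name and the "model" column are assumptions made for illustration, not taken from the Camel source.

import org.apache.camel.component.spark.DataFrameCallback;
import org.apache.spark.sql.DataFrame;

public class CountByModelCallback implements DataFrameCallback<Long> {
    @Override
    public Long onDataFrame(DataFrame dataFrame, Object... payloads) {
        // The single payload is expected to be a model name sent in the message body.
        String model = (String) payloads[0];
        return dataFrame.where(dataFrame.col("model").equalTo(model)).count();
    }
}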

Example 3 with DataFrame

Use of org.apache.spark.sql.DataFrame in project camel by apache.

From the class HiveSparkProducer, method process.

@Override
public void process(Exchange exchange) throws Exception {
    HiveContext hiveContext = resolveHiveContext();
    String sql = exchange.getIn().getBody(String.class);
    DataFrame resultFrame = hiveContext.sql(sql);
    exchange.getIn().setBody(getEndpoint().isCollect() ? resultFrame.collectAsList() : resultFrame.count());
}
Also used: DataFrame (org.apache.spark.sql.DataFrame), HiveContext (org.apache.spark.sql.hive.HiveContext)
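
On the client side, a message sent to a spark:hive endpoint drives this producer. A minimal sketch, assuming camel-spark is on the classpath and a HiveContext with a cars table is already registered; the query and endpoint URI are illustrative.

import org.apache.camel.CamelContext;
import org.apache.camel.ProducerTemplate;
import org.apache.camel.impl.DefaultCamelContext;

public class HiveQueryClient {
    public static void main(String[] args) throws Exception {
        CamelContext context = new DefaultCamelContext();
        context.start();
        ProducerTemplate template = context.createProducerTemplate();
        // With collect=false the producer returns the row count rather than the rows.
        Long count = template.requestBody("spark:hive?collect=false", "SELECT * FROM cars", Long.class);
        System.out.println("cars: " + count);
        context.stop();
    }
}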

Example 4 with DataFrame

Use of org.apache.spark.sql.DataFrame in project geode by apache.

From the class OQLJavaDemo, method main.

public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: OQLJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("OQLJavaDemo");
    // "192.168.1.47[10335]"
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame df = javaFunctions(sqlContext).geodeOQL("select * from /str_str_region");
    System.out.println("======= DataFrame =======\n");
    df.show();
    sc.stop();
}
Also used: JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), DataFrame (org.apache.spark.sql.DataFrame), SparkConf (org.apache.spark.SparkConf), SQLContext (org.apache.spark.sql.SQLContext)
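
Once the OQL result is exposed as a DataFrame it can be queried with plain Spark SQL. A short continuation of the main method above, purely for illustration; the column name key is an assumption about the /str_str_region schema.

    // Register the Geode-backed DataFrame and query it with Spark SQL.
    df.registerTempTable("str_str_region");
    DataFrame startingWithA = sqlContext.sql("SELECT * FROM str_str_region WHERE key LIKE 'a%'");
    startingWithA.show();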

Example 5 with DataFrame

Use of org.apache.spark.sql.DataFrame in project learning-spark by databricks.

From the class SparkSQLTwitter, method main.

public static void main(String[] args) {
    String inputFile = args[0];
    SparkConf conf = new SparkConf();
    JavaSparkContext sc = new JavaSparkContext(conf);
    SQLContext sqlCtx = new SQLContext(sc);
    DataFrame input = sqlCtx.jsonFile(inputFile);
    // Print the schema
    input.printSchema();
    // Register the input schema RDD
    input.registerTempTable("tweets");
    // Select tweets based on the retweetCount
    DataFrame topTweets = sqlCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10");
    Row[] result = topTweets.collect();
    for (Row row : result) {
        System.out.println(row.get(0));
    }
    JavaRDD<String> topTweetText = topTweets.toJavaRDD().map(new Function<Row, String>() {

        public String call(Row row) {
            return row.getString(0);
        }
    });
    System.out.println(topTweetText.collect());
    // Create a person and turn it into a Schema RDD
    ArrayList<HappyPerson> peopleList = new ArrayList<HappyPerson>();
    peopleList.add(new HappyPerson("holden", "coffee"));
    JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList);
    DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class);
    happyPeopleSchemaRDD.registerTempTable("happy_people");
    sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {

        @Override
        public Integer call(String str) throws Exception {
            return str.length();
        }
    }, DataTypes.IntegerType);
    // Note: 'text' is a string literal here, so the UDF returns its length (4) for every row.
    DataFrame tweetLength = sqlCtx.sql("SELECT stringLengthJava('text') FROM tweets LIMIT 10");
    Row[] lengths = tweetLength.collect();
    for (Row row : lengths) {
        System.out.println(row.get(0));
    }
    sc.stop();
}
Also used: ArrayList (java.util.ArrayList), DataFrame (org.apache.spark.sql.DataFrame), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Row (org.apache.spark.sql.Row), SparkConf (org.apache.spark.SparkConf), SQLContext (org.apache.spark.sql.SQLContext)
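
As a hypothetical variant of the last query, the registered UDF can be applied to the text column itself rather than to the literal 'text', so each row reports the length of its own tweet. A continuation of the main method above:

    // Apply stringLengthJava to the column, not to a string literal.
    DataFrame realLengths = sqlCtx.sql("SELECT text, stringLengthJava(text) AS len FROM tweets LIMIT 10");
    for (Row row : realLengths.collect()) {
        System.out.println(row.getString(0) + " -> " + row.getInt(1));
    }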

Aggregations

DataFrame (org.apache.spark.sql.DataFrame): 8 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 5 usages
SQLContext (org.apache.spark.sql.SQLContext): 5 usages
SparkConf (org.apache.spark.SparkConf): 3 usages
ArrayList (java.util.ArrayList): 1 usage
List (java.util.List): 1 usage
JndiRegistry (org.apache.camel.impl.JndiRegistry): 1 usage
Configuration (org.apache.hadoop.conf.Configuration): 1 usage
JavaRDDLike (org.apache.spark.api.java.JavaRDDLike): 1 usage
Row (org.apache.spark.sql.Row): 1 usage
HiveContext (org.apache.spark.sql.hive.HiveContext): 1 usage
BSONObject (org.bson.BSONObject): 1 usage
Tuple2 (scala.Tuple2): 1 usage