Example usage of co.cask.cdap.api.data.format.FormatSpecification in project cdap by caskdata.
The following snippet shows the run method of the StreamFormatSpecSpark class.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext sparkContext = new JavaSparkContext();
  SQLContext sqlContext = new SQLContext(sparkContext);

  // Describe each CSV stream event body as a two-field record: (name: string, age: int).
  Schema schema = Schema.recordOf(
    "record",
    ImmutableList.of(
      Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
      Schema.Field.of("age", Schema.of(Schema.Type.INT))));
  FormatSpecification formatSpec = new FormatSpecification("csv", schema);

  // Read the stream named by the "stream.name" runtime argument as
  // (timestamp, decoded event) pairs, parsing each body per the CSV format spec.
  String streamName = sec.getRuntimeArguments().get("stream.name");
  JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> events =
    sec.fromStream(streamName, formatSpec, StructuredRecord.class);

  // Map each decoded record to a Person bean so Spark SQL can infer the table schema,
  // then expose the result as the "people" temp table.
  JavaRDD<Person> people = events.values().map(
    new Function<GenericStreamEventData<StructuredRecord>, Person>() {
      @Override
      public Person call(GenericStreamEventData<StructuredRecord> event) throws Exception {
        StructuredRecord body = event.getBody();
        return new Person(body.<String>get("name"), body.<Integer>get("age"));
      }
    });
  sqlContext.createDataFrame(people, Person.class).registerTempTable("people");

  // Run the caller-supplied query ("sql.statement" runtime argument) and reshape
  // each result row into a (String, Integer) pair. NOTE(review): assumes the query
  // projects a string column first and an int column second — confirm with callers.
  String statement = sec.getRuntimeArguments().get("sql.statement");
  JavaPairRDD<String, Integer> results = sqlContext.sql(statement).toJavaRDD().mapToPair(
    new PairFunction<Row, String, Integer>() {
      @Override
      public Tuple2<String, Integer> call(Row row) throws Exception {
        return new Tuple2<>(row.getString(0), row.getInt(1));
      }
    });

  // Persist the query output to the dataset named by the "output.dataset" runtime argument.
  sec.saveAsDataset(results, sec.getRuntimeArguments().get("output.dataset"));
}
Aggregations