Use of org.apache.spark.sql.SQLContext in project Gaffer by gchq.
From the class FilterToOperationConverterTest, method testSpecifyDestination.
@Test
public void testSpecifyDestination() throws OperationException {
    final Schema schema = getSchema();
    final SQLContext sqlContext = getSqlContext("testSpecifyDestination");

    // A single filter equating the destination column to "0".
    final Filter[] filters = new Filter[1];
    filters[0] = new EqualTo(SchemaToStructTypeConverter.DST_COL_NAME, "0");
    final FiltersToOperationConverter converter =
            new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    final AbstractGetRDD<?> operation = converter.getOperation();

    // The converter should produce a seeded GetRDDOfElements whose view
    // contains only the edge groups, no entity groups.
    assertTrue(operation instanceof GetRDDOfElements);
    assertEquals(0, operation.getView().getEntityGroups().size());
    assertEquals(EDGE_GROUPS, operation.getView().getEdgeGroups());

    // The destination value should have been converted into a single EntitySeed.
    final Set<EntitySeed> seeds = new HashSet<>();
    for (final Object seed : ((GetRDDOfElements) operation).getSeeds()) {
        seeds.add((EntitySeed) seed);
    }
    assertEquals(Collections.singleton(new EntitySeed("0")), seeds);

    sqlContext.sparkContext().stop();
}
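The getSqlContext, getSchema and getViewFromSchema helpers are defined elsewhere in the test class and are not shown above. As a rough sketch, a getSqlContext helper could look like the following, assuming a local Spark 1.x master and a SQLContext built directly on a fresh SparkContext; the actual Gaffer helper may differ.

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SQLContext;

// Hypothetical helper: builds a local SQLContext named after the test, so
// sqlContext.sparkContext().stop() can tear it down at the end of the test.
private static SQLContext getSqlContext(final String appName) {
    final SparkConf conf = new SparkConf()
            .setMaster("local")
            .setAppName(appName);
    return new SQLContext(new SparkContext(conf));
}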
Use of org.apache.spark.sql.SQLContext in project Gaffer by gchq.
From the class FilterToOperationConverterTest, method testTwoGroups.
@Test
public void testTwoGroups() throws OperationException {
    final Schema schema = getSchema();
    final SQLContext sqlContext = getSqlContext("testTwoGroups");

    // A single filter restricting the group column to either the entity group
    // or the second edge group.
    final Filter[] filters = new Filter[1];
    final Filter left = new EqualTo(SchemaToStructTypeConverter.GROUP, ENTITY_GROUP);
    final Filter right = new EqualTo(SchemaToStructTypeConverter.GROUP, EDGE_GROUP2);
    filters[0] = new Or(left, right);
    final FiltersToOperationConverter converter =
            new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    final AbstractGetRDD<?> operation = converter.getOperation();

    // With no seed information, the result is a GetRDDOfAllElements whose view
    // is restricted to exactly those two groups.
    assertTrue(operation instanceof GetRDDOfAllElements);
    assertEquals(Collections.singleton(ENTITY_GROUP), operation.getView().getEntityGroups());
    assertEquals(Collections.singleton(EDGE_GROUP2), operation.getView().getEdgeGroups());

    sqlContext.sparkContext().stop();
}
Use of org.apache.spark.sql.SQLContext in project mongo-hadoop by mongodb.
From the class DataframeExample, method run.
public void run() {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf());

    // Set configuration options for the MongoDB Hadoop Connector.
    Configuration mongodbConfig = new Configuration();
    // MongoInputFormat allows us to read from a live MongoDB instance.
    // We could also use BSONFileInputFormat to read BSON snapshots.
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    // MongoDB connection string naming a collection to use.
    // If using BSON, use "mapred.input.dir" to configure the directory
    // where BSON files are located instead.
    mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages");

    // Create an RDD backed by the MongoDB collection.
    JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
            mongodbConfig,          // Configuration
            MongoInputFormat.class, // InputFormat: read from a live cluster.
            Object.class,           // Key class
            BSONObject.class        // Value class
    );

    // Map each (id, document) pair to a Message bean, pulling the fields of
    // interest out of the "headers" sub-document and the message body.
    JavaRDD<Message> messages = documents.map(new Function<Tuple2<Object, BSONObject>, Message>() {
        public Message call(final Tuple2<Object, BSONObject> tuple) {
            Message m = new Message();
            BSONObject header = (BSONObject) tuple._2().get("headers");
            m.setTo((String) header.get("To"));
            m.setxFrom((String) header.get("From"));
            m.setMessageID((String) header.get("Message-ID"));
            m.setBody((String) tuple._2().get("body"));
            return m;
        }
    });

    // Turn the RDD of beans into a DataFrame, register it as a temporary table
    // and query it with Spark SQL.
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame messagesSchema = sqlContext.createDataFrame(messages, Message.class);
    messagesSchema.registerTempTable("messages");
    DataFrame ericsMessages = sqlContext.sql("SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\"");
    ericsMessages.show();
    messagesSchema.printSchema();
}
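The Message class itself is not shown above; sqlContext.createDataFrame(messages, Message.class) requires it to be a serializable JavaBean, and Spark derives the DataFrame columns from its getter/setter pairs. A minimal sketch consistent with the setters used in the example follows; the field layout is an assumption, not necessarily the connector's actual class.

import java.io.Serializable;

// Hypothetical JavaBean backing the "messages" DataFrame; the getter/setter
// pairs yield the columns to, xFrom, messageID and body.
public class Message implements Serializable {
    private String to;
    private String xFrom;
    private String messageID;
    private String body;

    public String getTo() { return to; }
    public void setTo(final String to) { this.to = to; }

    public String getxFrom() { return xFrom; }
    public void setxFrom(final String xFrom) { this.xFrom = xFrom; }

    public String getMessageID() { return messageID; }
    public void setMessageID(final String messageID) { this.messageID = messageID; }

    public String getBody() { return body; }
    public void setBody(final String body) { this.body = body; }
}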
Use of org.apache.spark.sql.SQLContext in project cdap by caskdata.
From the class StreamFormatSpecSpark, method run.
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    SQLContext sqlContext = new SQLContext(jsc);

    // Read from a CSV stream and turn it into a DataFrame.
    String streamName = sec.getRuntimeArguments().get("stream.name");
    Schema schema = Schema.recordOf("record",
            ImmutableList.of(Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                             Schema.Field.of("age", Schema.of(Schema.Type.INT))));
    FormatSpecification formatSpec = new FormatSpecification("csv", schema);
    JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> rdd =
            sec.fromStream(streamName, formatSpec, StructuredRecord.class);
    JavaRDD<Person> personRDD = rdd.values().map(new Function<GenericStreamEventData<StructuredRecord>, Person>() {
        @Override
        public Person call(GenericStreamEventData<StructuredRecord> data) throws Exception {
            StructuredRecord record = data.getBody();
            return new Person(record.<String>get("name"), record.<Integer>get("age"));
        }
    });
    sqlContext.createDataFrame(personRDD, Person.class).registerTempTable("people");

    // Execute a SQL statement on the table and save the result.
    JavaPairRDD<String, Integer> resultRDD = sqlContext.sql(sec.getRuntimeArguments().get("sql.statement"))
            .toJavaRDD()
            .mapToPair(new PairFunction<Row, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(Row row) throws Exception {
                    return new Tuple2<>(row.getString(0), row.getInt(1));
                }
            });
    sec.saveAsDataset(resultRDD, sec.getRuntimeArguments().get("output.dataset"));
}
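Likewise, Person is assumed to be a serializable JavaBean whose name and age properties supply the columns of the "people" table; the two-argument constructor matches the map function above. A minimal sketch follows; the real CDAP example class may differ.

import java.io.Serializable;

// Hypothetical bean matching new Person(name, age) above; its getters give the
// DataFrame the "name" and "age" columns queried by the SQL statement.
public class Person implements Serializable {
    private String name;
    private Integer age;

    public Person() { }

    public Person(final String name, final Integer age) {
        this.name = name;
        this.age = age;
    }

    public String getName() { return name; }
    public void setName(final String name) { this.name = name; }

    public Integer getAge() { return age; }
    public void setAge(final Integer age) { this.age = age; }
}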