Example 31 with SQLContext

Use of org.apache.spark.sql.SQLContext in project Gaffer by gchq.

The class FilterToOperationConverterTest, method testSpecifyDestination.

@Test
public void testSpecifyDestination() throws OperationException {
    final Schema schema = getSchema();
    final SQLContext sqlContext = getSqlContext("testSpecifyDestination");
    // A single filter fixing the destination vertex to "0".
    final Filter[] filters = new Filter[1];
    filters[0] = new EqualTo(SchemaToStructTypeConverter.DST_COL_NAME, "0");
    final FiltersToOperationConverter converter = new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    final AbstractGetRDD<?> operation = converter.getOperation();
    // A filter on the destination column should become a seeded GetRDDOfElements
    // whose view contains only edge groups, since entities have no destination.
    assertTrue(operation instanceof GetRDDOfElements);
    assertEquals(0, operation.getView().getEntityGroups().size());
    assertEquals(EDGE_GROUPS, operation.getView().getEdgeGroups());
    // The destination value should be translated into a single EntitySeed.
    final Set<EntitySeed> seeds = new HashSet<>();
    for (final Object seed : ((GetRDDOfElements) operation).getSeeds()) {
        seeds.add((EntitySeed) seed);
    }
    assertEquals(Collections.singleton(new EntitySeed("0")), seeds);
    sqlContext.sparkContext().stop();
}
Also used: Filter(org.apache.spark.sql.sources.Filter) Schema(uk.gov.gchq.gaffer.store.schema.Schema) EntitySeed(uk.gov.gchq.gaffer.operation.data.EntitySeed) SQLContext(org.apache.spark.sql.SQLContext) EqualTo(org.apache.spark.sql.sources.EqualTo) GetRDDOfElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfElements) HashSet(java.util.HashSet) Test(org.junit.Test)
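
The getSqlContext helper used by these Gaffer tests is not shown on this page. A minimal sketch of what such a helper might look like, assuming a local-mode SparkConf and the Spark 1.x SQLContext(SparkContext) constructor; the name and body here are illustrative, not Gaffer's actual code:

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SQLContext;

// Hypothetical helper: builds a local SparkContext named after the test
// and wraps it in a SQLContext. Gaffer's real helper may differ.
private SQLContext getSqlContext(final String appName) {
    final SparkConf conf = new SparkConf()
            .setMaster("local")
            .setAppName(appName);
    return new SQLContext(new SparkContext(conf));
}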

Example 32 with SQLContext

Use of org.apache.spark.sql.SQLContext in project Gaffer by gchq.

The class FilterToOperationConverterTest, method testTwoGroups.

@Test
public void testTwoGroups() throws OperationException {
    final Schema schema = getSchema();
    final SQLContext sqlContext = getSqlContext("testTwoGroups");
    // A single Or filter matching either the entity group or the second edge group.
    final Filter[] filters = new Filter[1];
    final Filter left = new EqualTo(SchemaToStructTypeConverter.GROUP, ENTITY_GROUP);
    final Filter right = new EqualTo(SchemaToStructTypeConverter.GROUP, EDGE_GROUP2);
    filters[0] = new Or(left, right);
    final FiltersToOperationConverter converter = new FiltersToOperationConverter(sqlContext, getViewFromSchema(schema), schema, filters);
    final AbstractGetRDD<?> operation = converter.getOperation();
    // A group-only filter provides no seeds, so it converts to GetRDDOfAllElements
    // with the view restricted to the two requested groups.
    assertTrue(operation instanceof GetRDDOfAllElements);
    assertEquals(Collections.singleton(ENTITY_GROUP), operation.getView().getEntityGroups());
    assertEquals(Collections.singleton(EDGE_GROUP2), operation.getView().getEdgeGroups());
    sqlContext.sparkContext().stop();
}
Also used: Or(org.apache.spark.sql.sources.Or) Filter(org.apache.spark.sql.sources.Filter) GetRDDOfAllElements(uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfAllElements) Schema(uk.gov.gchq.gaffer.store.schema.Schema) SQLContext(org.apache.spark.sql.SQLContext) EqualTo(org.apache.spark.sql.sources.EqualTo) Test(org.junit.Test)
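
Filters such as this Or are not usually constructed by hand; Spark's data source API pushes them down from a DataFrame query. A minimal sketch of a query that would arrive at a filter-pushing source as Or(EqualTo, EqualTo), assuming Spark 1.x; the format string and group values below are placeholders, not Gaffer's actual identifiers:

import static org.apache.spark.sql.functions.col;
import org.apache.spark.sql.DataFrame;

// A WHERE clause of the form "group = x OR group = y" reaches the
// data source as a single Or(EqualTo("group", x), EqualTo("group", y)).
DataFrame df = sqlContext.read()
        .format("uk.gov.gchq.gaffer.spark")  // placeholder source name
        .load();
DataFrame filtered = df.filter(col("group").equalTo("entityGroup")
        .or(col("group").equalTo("edgeGroup2")));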

Example 33 with SQLContext

Use of org.apache.spark.sql.SQLContext in project mongo-hadoop by mongodb.

The class DataframeExample, method run.

public void run() {
    JavaSparkContext sc = new JavaSparkContext(new SparkConf());
    // Set configuration options for the MongoDB Hadoop Connector.
    Configuration mongodbConfig = new Configuration();
    // MongoInputFormat allows us to read from a live MongoDB instance.
    // We could also use BSONFileInputFormat to read BSON snapshots.
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    // MongoDB connection string naming a collection to use.
    // If using BSON, use "mapred.input.dir" to configure the directory
    // where BSON files are located instead.
    mongodbConfig.set("mongo.input.uri", "mongodb://localhost:27017/enron_mail.messages");
    // Create an RDD backed by the MongoDB collection.
    JavaPairRDD<Object, BSONObject> documents = sc.newAPIHadoopRDD(
            mongodbConfig,           // Configuration
            MongoInputFormat.class,  // InputFormat: read from a live cluster.
            Object.class,            // Key class
            BSONObject.class);       // Value class
    JavaRDD<Message> messages = documents.map(new Function<Tuple2<Object, BSONObject>, Message>() {

        public Message call(final Tuple2<Object, BSONObject> tuple) {
            Message m = new Message();
            BSONObject header = (BSONObject) tuple._2().get("headers");
            m.setTo((String) header.get("To"));
            m.setxFrom((String) header.get("From"));
            m.setMessageID((String) header.get("Message-ID"));
            m.setBody((String) tuple._2().get("body"));
            return m;
        }
    });
    SQLContext sqlContext = new org.apache.spark.sql.SQLContext(sc);
    DataFrame messagesSchema = sqlContext.createDataFrame(messages, Message.class);
    messagesSchema.registerTempTable("messages");
    DataFrame ericsMessages = sqlContext.sql("SELECT to, body FROM messages WHERE to = \"eric.bass@enron.com\"");
    ericsMessages.show();
    messagesSchema.printSchema();
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) BSONObject(org.bson.BSONObject) DataFrame(org.apache.spark.sql.DataFrame) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf) SQLContext(org.apache.spark.sql.SQLContext)
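
The SQL string above can also be written with the Spark 1.x DataFrame API; a minimal sketch against the same messagesSchema DataFrame, where the "to" and "body" columns come from the Message bean's getters:

import static org.apache.spark.sql.functions.col;

// Equivalent to the SELECT above: restrict to one recipient, project two columns.
DataFrame ericsMessages = messagesSchema
        .filter(col("to").equalTo("eric.bass@enron.com"))
        .select("to", "body");
ericsMessages.show();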

Example 34 with SQLContext

Use of org.apache.spark.sql.SQLContext in project cdap by caskdata.

The class StreamFormatSpecSpark, method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    SQLContext sqlContext = new SQLContext(jsc);
    // Read from the CSV stream and turn it into a DataFrame.
    String streamName = sec.getRuntimeArguments().get("stream.name");
    Schema schema = Schema.recordOf("record",
            ImmutableList.of(
                    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
                    Schema.Field.of("age", Schema.of(Schema.Type.INT))));
    FormatSpecification formatSpec = new FormatSpecification("csv", schema);
    JavaPairRDD<Long, GenericStreamEventData<StructuredRecord>> rdd = sec.fromStream(streamName, formatSpec, StructuredRecord.class);
    // Map each stream event body to a Person bean so Spark can infer a schema.
    JavaRDD<Person> personRDD = rdd.values().map(new Function<GenericStreamEventData<StructuredRecord>, Person>() {

        @Override
        public Person call(GenericStreamEventData<StructuredRecord> data) throws Exception {
            StructuredRecord record = data.getBody();
            return new Person(record.<String>get("name"), record.<Integer>get("age"));
        }
    });
    sqlContext.createDataFrame(personRDD, Person.class).registerTempTable("people");
    // Execute a SQL statement on the table and save the result as (name, age) pairs.
    JavaPairRDD<String, Integer> resultRDD = sqlContext.sql(sec.getRuntimeArguments().get("sql.statement")).toJavaRDD().mapToPair(new PairFunction<Row, String, Integer>() {

        @Override
        public Tuple2<String, Integer> call(Row row) throws Exception {
            return new Tuple2<>(row.getString(0), row.getInt(1));
        }
    });
    sec.saveAsDataset(resultRDD, sec.getRuntimeArguments().get("output.dataset"));
}
Also used: Schema(co.cask.cdap.api.data.schema.Schema) FormatSpecification(co.cask.cdap.api.data.format.FormatSpecification) GenericStreamEventData(co.cask.cdap.api.stream.GenericStreamEventData) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SQLContext(org.apache.spark.sql.SQLContext)
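
createDataFrame(personRDD, Person.class) requires Person to be a JavaBean, but the class itself is not shown on this page. A minimal sketch of the shape it needs, with field names matching the stream schema above; the real CDAP example class may differ:

import java.io.Serializable;

// Hypothetical bean: Spark derives the DataFrame columns "name" and "age"
// from these getter/setter pairs.
public static class Person implements Serializable {
    private String name;
    private int age;

    public Person() {
    }

    public Person(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }
}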

Aggregations

SQLContext (org.apache.spark.sql.SQLContext): 34
Test (org.junit.Test): 20
HashSet (java.util.HashSet): 15
Schema (uk.gov.gchq.gaffer.store.schema.Schema): 14
Row (org.apache.spark.sql.Row): 12
Filter (org.apache.spark.sql.sources.Filter): 11
User (uk.gov.gchq.gaffer.user.User): 11
View (uk.gov.gchq.gaffer.data.elementdefinition.view.View): 10
EqualTo (org.apache.spark.sql.sources.EqualTo): 9
Graph (uk.gov.gchq.gaffer.graph.Graph): 9
GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements): 8
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 7
MutableList (scala.collection.mutable.MutableList): 7
ArrayList (java.util.ArrayList): 6
DataFrame (org.apache.spark.sql.DataFrame): 5
IsMoreThan (uk.gov.gchq.gaffer.function.filter.IsMoreThan): 5
EntitySeed (uk.gov.gchq.gaffer.operation.data.EntitySeed): 5
GetRDDOfElements (uk.gov.gchq.gaffer.spark.operation.scalardd.GetRDDOfElements): 5
SparkConf (org.apache.spark.SparkConf): 4
SparkContext (org.apache.spark.SparkContext): 4