Search in sources :

Example 51 with Row

use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

the class JsonArrayToDataset method start.

/**
 * Reads {@code data/array.json} as JSON into a {@code Dataset<Row>}, prints how
 * long the read call took, then displays the data and schema before and after
 * applying {@code explode} to the {@code valsInArrays} column.
 */
private void start() {
    SparkSession session = SparkSession.builder()
        .appName("JSON array to Dataset")
        .master("local")
        .getOrCreate();

    final String jsonPath = "data/array.json";

    // Time only the read call itself.
    final long begin = System.currentTimeMillis();
    Dataset<Row> raw = session.read().json(jsonPath);
    final long elapsedMs = System.currentTimeMillis() - begin;
    System.out.println("Processing took " + elapsedMs + " ms");

    raw.show();
    raw.printSchema();

    // Apply explode to the "valsInArrays" column and name the single
    // resulting column "vals".
    Dataset<Row> exploded = raw
        .select(explode(raw.col("valsInArrays")))
        .toDF("vals");
    exploded.show();
    exploded.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 52 with Row

use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

the class JsonMapToDataset method start.

/**
 * Reads {@code data/map.json} as JSON into a {@code Dataset<Row>}, prints how
 * long the read call took, then displays the data and its schema.
 */
private void start() {
    SparkSession session = SparkSession.builder()
        .appName("JSON map to Dataset")
        .master("local")
        .getOrCreate();

    final String jsonPath = "data/map.json";

    // Time only the read call itself.
    final long begin = System.currentTimeMillis();
    Dataset<Row> frame = session.read().json(jsonPath);
    final long elapsedMs = System.currentTimeMillis() - begin;
    System.out.println("Processing took " + elapsedMs + " ms");

    frame.show();
    frame.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 53 with Row

use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

the class QuotedCsvWithHeaderToDataset method start.

/**
 * Reads {@code data/csv-quoted.txt} as CSV with the {@code header} and
 * {@code inferSchema} options enabled, then displays the data and its schema.
 */
private void start() {
    SparkSession session = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    final String csvPath = "data/csv-quoted.txt";

    Dataset<Row> frame = session.read()
        .option("inferSchema", "true")
        .option("header", "true")
        .csv(csvPath);

    frame.show();
    frame.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 54 with Row

use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

the class MySQLToDatasetApp method start.

/**
 * Reads the {@code actor} table of the {@code sakila} database over JDBC into a
 * {@code Dataset<Row>}, orders it by the {@code last_name} column and displays it.
 */
private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Dataset from MySQL JDBC Connection")
        .master("local")
        .getOrCreate();

    // Use setProperty rather than the inherited Hashtable.put: it restricts
    // keys and values to Strings, as the Properties contract expects.
    // NOTE(review): credentials are hard-coded here; consider supplying them
    // from configuration or the environment instead.
    Properties props = new Properties();
    props.setProperty("user", "root");
    props.setProperty("password", "password");
    props.setProperty("useSSL", "false");

    Dataset<Row> df = spark.read().jdbc(
        "jdbc:mysql://localhost:3306/sakila?serverTimezone=EST", "actor", props);
    df = df.orderBy(df.col("last_name"));
    df.show();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Properties(java.util.Properties) Row(org.apache.spark.sql.Row)

Example 55 with Row

use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

the class ArrayToDataframeApp method start.

/**
 * Builds a {@code Dataset<String>} from the four strings "a" through "d" using
 * the string encoder, converts it to a {@code Dataset<Row>} and displays it.
 */
private void start() {
    SparkSession session = SparkSession.builder()
        .appName("Array to Dataframe")
        .master("local")
        .getOrCreate();

    // Fixed-size list view over the literal values.
    List<String> letters = Arrays.asList("a", "b", "c", "d");

    Dataset<String> typed = session.createDataset(letters, Encoders.STRING());
    Dataset<Row> frame = typed.toDF();
    frame.show();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Aggregations

Row (org.apache.spark.sql.Row)129 Test (org.junit.Test)60 Script (org.apache.sysml.api.mlcontext.Script)53 StructType (org.apache.spark.sql.types.StructType)50 ArrayList (java.util.ArrayList)48 StructField (org.apache.spark.sql.types.StructField)46 SparkSession (org.apache.spark.sql.SparkSession)43 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)19 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)19 MLResults (org.apache.sysml.api.mlcontext.MLResults)18 DenseVector (org.apache.spark.ml.linalg.DenseVector)16 Vector (org.apache.spark.ml.linalg.Vector)16 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)15 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)12 SQLContext (org.apache.spark.sql.SQLContext)12 User (uk.gov.gchq.gaffer.user.User)12 HashSet (java.util.HashSet)10 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)9 Tuple2 (scala.Tuple2)9 GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements)9