
Example 31 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class CsvWithDoubleHeaderToDataset method start.

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();
    String filename = "data/csv-double-header.txt";
    // TODO: build the schema from the file's real header lines instead of
    // these placeholder strings (a sketch of such a method follows this example).
    StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file");
    // Dirty trick: the "comment" option below skips the extra header line so
    // the data file does not have to be manipulated by hand.
    Dataset<Row> df = spark.read()
        .schema(schema)
        .option("inferSchema", "false")
        .option("comment", "#")
        .option("header", "true")
        .option("mode", "DROPMALFORMED")
        .csv(filename);
    df.show();
    df.printSchema();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), StructType (org.apache.spark.sql.types.StructType), Row (org.apache.spark.sql.Row)
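
The buildSchemaFromCsvDefinition() call above receives placeholder strings, and the method itself is left as a TODO in the project. Below is a minimal, hypothetical sketch of what such a helper could look like, assuming the first header line holds comma-separated column names and the second holds type names such as int, double, or string with the same number of fields; that layout is an assumption, not something the original example specifies.

private StructType buildSchemaFromCsvDefinition(String namesLine, String typesLine) {
    // Hypothetical sketch, not the project's implementation.
    // Requires org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType}
    // and java.util.{ArrayList, List}.
    String[] names = namesLine.split(",");
    String[] types = typesLine.split(",");
    List<StructField> fields = new ArrayList<>();
    for (int i = 0; i < names.length; i++) {
        DataType type;
        switch (types[i].trim().toLowerCase()) {
            case "int":
            case "integer":
                type = DataTypes.IntegerType;
                break;
            case "double":
                type = DataTypes.DoubleType;
                break;
            default:
                type = DataTypes.StringType;
        }
        fields.add(DataTypes.createStructField(names[i].trim(), type, true));
    }
    return DataTypes.createStructType(fields);
}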

Example 32 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class CsvToDatasetCompatibleWithSparkv1x method start.

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();
    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read()
        .format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(filename);
    df.show();
    // To stay compatible with both Spark 2.0.0 and Spark 1.6.x, rename the
    // default _c0, _c1, ... columns to C0, C1, ... (see the toDF() sketch
    // after this example).
    int count = df.columns().length;
    for (int i = 0; i < count; i++) {
        String oldColName = "_c" + i;
        String newColName = "C" + i;
        df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
    }
    df.show();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
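
A side note, not part of the original example: the same wholesale rename can be written in a single call with Dataset.toDF(String...), which also keeps the original column order (the withColumn/drop loop above moves each renamed column to the end):

// Alternative sketch: rename _c0, _c1, ... to C0, C1, ... in one call.
String[] newNames = new String[df.columns().length];
for (int i = 0; i < newNames.length; i++) {
    newNames[i] = "C" + i;
}
df = df.toDF(newNames);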

Example 33 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class CsvWithHeaderToDataset method start.

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();
    String filename = "data/csv-q.txt";
    Dataset<Row> df = spark.read()
        .option("inferSchema", "true")
        .option("header", "true")
        .csv(filename);
    df.show();
    df.printSchema();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
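
Schema inference costs an extra pass over the file. When the layout is known in advance, an explicit schema avoids that pass; the sketch below uses hypothetical column names, since the content of data/csv-q.txt is not shown here:

// Hypothetical schema; the real column names and types depend on data/csv-q.txt.
StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("id", DataTypes.IntegerType, false),
    DataTypes.createStructField("title", DataTypes.StringType, true)
});
Dataset<Row> df = spark.read()
    .schema(schema)
    .option("header", "true")
    .csv(filename);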

Example 34 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class XmlToDataset method start.

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("XML to Dataset")
        .master("local")
        .getOrCreate();
    String filename = "data/budget-2017.xml";
    long start = System.currentTimeMillis();
    // The "xml" format comes from the external spark-xml connector, not from
    // Spark itself, so the package must be on the classpath.
    Dataset<Row> df = spark.read()
        .format("xml")
        .option("rowTag", "item")
        .load(filename);
    long stop = System.currentTimeMillis();
    System.out.println("Processing took " + (stop - start) + " ms");
    df.show();
    df.printSchema();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
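
One way to make the spark-xml connector available when running locally is the spark.jars.packages setting, which must be configured before the session is created; another is passing --packages to spark-submit. The coordinates below are illustrative only, so adjust the Scala suffix and version to your environment:

// Sketch: resolve the spark-xml package at session startup.
// Example coordinates only; match them to your Spark/Scala versions.
SparkSession spark = SparkSession.builder()
    .appName("XML to Dataset")
    .master("local")
    .config("spark.jars.packages", "com.databricks:spark-xml_2.11:0.4.1")
    .getOrCreate();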

Example 35 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class TextFileToDataset method start.

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Dataset from Text File")
        .master("local[*]")
        .getOrCreate();
    String filename = "data/simple-data-file.txt";
    // text() yields a Dataset<Row> with a single string column named "value".
    Dataset<Row> df = spark.read().text(filename);
    df.show();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
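
spark.read().text() always produces that single "value" column. If the lines need to be split into fields, the split() function from org.apache.spark.sql.functions is one option; the sketch below assumes comma-separated lines with two fields, which may not match data/simple-data-file.txt:

// Assumption: each line holds two comma-separated fields.
// Requires: import static org.apache.spark.sql.functions.*;
Dataset<Row> parsed = df
    .withColumn("fields", split(col("value"), ","))
    .withColumn("first", col("fields").getItem(0))
    .withColumn("second", col("fields").getItem(1))
    .drop("fields", "value");
parsed.show();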

Aggregations

SparkSession (org.apache.spark.sql.SparkSession): 53
Row (org.apache.spark.sql.Row): 43
StructType (org.apache.spark.sql.types.StructType): 11
ArrayList (java.util.ArrayList): 6
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 6
StructField (org.apache.spark.sql.types.StructField): 6
SparkConf (org.apache.spark.SparkConf): 4
JavaRDD (org.apache.spark.api.java.JavaRDD): 3
Script (org.apache.sysml.api.mlcontext.Script): 3
Test (org.junit.Test): 3
Dataset (org.apache.spark.sql.Dataset): 2
StreamingQuery (org.apache.spark.sql.streaming.StreamingQuery): 2
StreamingQueryException (org.apache.spark.sql.streaming.StreamingQueryException): 2
DMLScript (org.apache.sysml.api.DMLScript): 2
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 2
MLContext (org.apache.sysml.api.mlcontext.MLContext): 2
Matrix (org.apache.sysml.api.mlcontext.Matrix): 2
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 2
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 2
File (java.io.File): 1