Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class CsvWithDoubleHeaderToDataset, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();
  String filename = "data/csv-double-header.txt";

  // TODO
  StructType schema = buildSchemaFromCsvDefinition("1st line of file", "2nd line of file");
  // I use a dirty comment trick to avoid manipulating the data file, but
  // one could build the method...

  Dataset<Row> df = spark.read()
      .schema(schema)
      .option("inferSchema", "false")
      .option("comment", "#")
      .option("header", "true")
      .option("mode", "DROPMALFORMED")
      .csv(filename);
  df.show();
  df.printSchema();
}
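The TODO above leaves buildSchemaFromCsvDefinition as a stub called with placeholder arguments. A minimal sketch of how that method could be built, assuming the first header line holds comma-separated column names and the second holds matching type names (int and double recognized, anything else treated as string), using org.apache.spark.sql.types.DataTypes:

private StructType buildSchemaFromCsvDefinition(String headerLine, String typeLine) {
  // Assumption: headerLine holds comma-separated column names and
  // typeLine holds matching type names such as int, double, string.
  String[] names = headerLine.split(",");
  String[] types = typeLine.split(",");
  StructField[] fields = new StructField[names.length];
  for (int i = 0; i < names.length; i++) {
    DataType type;
    switch (types[i].trim().toLowerCase()) {
      case "int":
        type = DataTypes.IntegerType;
        break;
      case "double":
        type = DataTypes.DoubleType;
        break;
      default:
        type = DataTypes.StringType;
    }
    fields[i] = DataTypes.createStructField(names[i].trim(), type, true);
  }
  return DataTypes.createStructType(fields);
}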
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class CsvToDatasetCompatibleWithSparkv1x, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();
  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .load(filename);
  df.show();

  // To ensure compatibility between Spark 2.0.0 and Spark 1.6.x:
  // Spark 2.x names headerless columns _c0, _c1, ...; rename them to
  // the C0, C1, ... convention used by the Spark 1.6.x CSV reader.
  int count = df.columns().length;
  for (int i = 0; i < count; i++) {
    String oldColName = "_c" + i;
    String newColName = "C" + i;
    df = df.withColumn(newColName, df.col(oldColName)).drop(oldColName);
  }
  df.show();
}
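A shorter route to the same result, shown as a sketch, renames each column in place with withColumnRenamed instead of copying it and dropping the original; this also preserves the column order:

for (int i = 0; i < df.columns().length; i++) {
  df = df.withColumnRenamed("_c" + i, "C" + i);
}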
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class CsvWithHeaderToDataset, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();
  String filename = "data/csv-q.txt";
  Dataset<Row> df = spark.read()
      .option("inferSchema", "true")
      .option("header", "true")
      .csv(filename);
  df.show();
  df.printSchema();
}
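Note that inferSchema forces an extra pass over the data. When the layout is known in advance, passing an explicit schema avoids that pass; a sketch using hypothetical column names id and title (the actual columns of csv-q.txt are not shown here):

StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("id", DataTypes.IntegerType, false),
    DataTypes.createStructField("title", DataTypes.StringType, true) });
Dataset<Row> df2 = spark.read()
    .schema(schema)
    .option("header", "true")
    .csv(filename);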
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class XmlToDataset, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("XML to Dataset")
      .master("local")
      .getOrCreate();
  String filename = "data/budget-2017.xml";

  // Time the read; each <item> element becomes one Row.
  long start = System.currentTimeMillis();
  Dataset<Row> df = spark.read()
      .format("xml")
      .option("rowTag", "item")
      .load(filename);
  long stop = System.currentTimeMillis();
  System.out.println("Processing took " + (stop - start) + " ms");

  df.show();
  df.printSchema();
}
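The xml format is not bundled with Spark; it comes from the external spark-xml package (com.databricks:spark-xml), which must be on the classpath. The timed read above also includes inferring the schema from the whole document; as a sketch, assuming hypothetical item fields name and amount, an explicit schema skips that inference pass:

StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("name", DataTypes.StringType, true),
    DataTypes.createStructField("amount", DataTypes.DoubleType, true) });
Dataset<Row> df2 = spark.read()
    .format("xml")
    .option("rowTag", "item")
    .schema(schema)
    .load(filename);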
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class TextFileToDataset, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Dataset from Text File")
      .master("local[*]")
      .getOrCreate();
  String filename = "data/simple-data-file.txt";
  Dataset<Row> df = spark.read().text(filename);
  df.show();
}
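read().text() always yields a single string column named value, one row per line, so getting real columns means parsing the lines. A minimal sketch, assuming (hypothetically) that each line in the file holds comma-separated fields, using org.apache.spark.sql.functions:

// Assumption: each line holds comma-separated fields.
Dataset<Row> parsed = df.withColumn("first",
    functions.split(df.col("value"), ",").getItem(0));
parsed.show();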