
Example 61 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

The class AuthorsAndBooksWithDates, method start().

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Authors and Books").master("local").getOrCreate();

    String filename = "data/authors.csv";
    // Date pattern must be "MM/dd/yy": uppercase MM is month-of-year, while the
    // original "mm/dd/yy" means minute-of-hour and silently misparses the month.
    Dataset<Row> authorsDf = spark.read().format("csv")
        .option("inferSchema", "true").option("header", "true")
        .option("dateFormat", "MM/dd/yy").load(filename);
    authorsDf.show();

    filename = "data/books.csv";
    Dataset<Row> booksDf = spark.read().format("csv")
        .option("inferSchema", "true").option("header", "true").load(filename);
    booksDf.show();

    // Full outer join keeps authors without books and books without a matching author.
    Dataset<Row> libraryDf = authorsDf
        .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
        .withColumn("bookId", booksDf.col("id"))
        .drop(booksDf.col("id"));
    libraryDf.show();
    libraryDf.printSchema();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
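The dateFormat fix above is worth a standalone check. Spark's CSV reader interprets the pattern with Java date semantics (SimpleDateFormat in older Spark, DateTimeFormatter in Spark 3.x; the MM/mm distinction is the same in both). A minimal plain-Java sketch showing the difference:

import java.text.SimpleDateFormat;

public class DatePatternCheck {
    public static void main(String[] args) throws Exception {
        // "MM/dd/yy" reads "10" as the month (October).
        System.out.println(new SimpleDateFormat("MM/dd/yy").parse("10/03/65"));
        // "mm/dd/yy" reads "10" as minutes; the month silently defaults to January.
        System.out.println(new SimpleDateFormat("mm/dd/yy").parse("10/03/65"));
    }
}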

Example 62 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

The class CsvToDatasetBook, method start().

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset<Book>").master("local").getOrCreate();
    String filename = "data/books.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true").option("header", "true").load(filename);
    df.show();
    // Convert the untyped Dataset<Row> into a typed Dataset<Book> via a bean encoder.
    Dataset<Book> bookDf = df.map(new BookMapper(), Encoders.bean(Book.class));
    bookDf.show();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Book (net.jgp.labs.spark.x.model.Book), Row (org.apache.spark.sql.Row)
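The conversion hinges on BookMapper, which is not reproduced on this page. A minimal sketch of such a mapper, assuming the Book bean exposes id, authorId, and title setters (the column and property names are assumptions, not taken from the project):

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

class BookMapper implements MapFunction<Row, Book> {
    private static final long serialVersionUID = 1L;

    @Override
    public Book call(Row row) throws Exception {
        Book book = new Book();
        // Resolve columns by name so the mapper is robust to column reordering.
        book.setId(row.getInt(row.fieldIndex("id")));
        book.setAuthorId(row.getInt(row.fieldIndex("authorId")));
        book.setTitle(row.getString(row.fieldIndex("title")));
        return book;
    }
}

Encoders.bean(Book.class) then derives the Dataset<Book> schema from the bean's getters and setters.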

Example 63 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

The class SimplePredictionFromTextFile, method start().

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Simple prediction from Text File").master("local").getOrCreate();
    spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

    String filename = "data/tuple-data-file.csv";
    StructType schema = new StructType(new StructField[] {
        new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
        new StructField("features", new VectorUDT(), true, Metadata.empty()) });
    Dataset<Row> df = spark.read().format("csv").schema(schema)
        .option("header", "false").load(filename);

    // Rename the raw columns into the (features, label) shape Spark ML expects.
    df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
    df = df.withColumn("label", df.col("_c1")).drop("_c1");
    df.printSchema();
    df = df.withColumn("features", callUDF("vectorBuilder", df.col("valuefeatures")));
    df.printSchema();
    df.show();

    // Optionally add regularization: .setRegParam(1).setElasticNetParam(1)
    LinearRegression lr = new LinearRegression().setMaxIter(20);
    // Fit the model to the data.
    LinearRegressionModel model = lr.fit(df);
    // Given a dataset, predict each point's label, and show the results.
    model.transform(df).show();

    LinearRegressionTrainingSummary trainingSummary = model.summary();
    System.out.println("numIterations: " + trainingSummary.totalIterations());
    System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
    trainingSummary.residuals().show();
    System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
    System.out.println("r2: " + trainingSummary.r2());

    double intercept = model.intercept();
    System.out.println("Intercept: " + intercept);
    double regParam = model.getRegParam();
    System.out.println("Regularization parameter: " + regParam);
    double tol = model.getTol();
    System.out.println("Tolerance: " + tol);

    Double feature = 7.0;
    Vector features = Vectors.dense(feature);
    double p = model.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p);
    // Manual check for x = 8: slope * x + intercept. The original multiplied by
    // regParam, which is the regularization hyperparameter, not the learned slope.
    System.out.println(model.coefficients().apply(0) * 8 + intercept);
}
Also used: VectorUDT (org.apache.spark.ml.linalg.VectorUDT), SparkSession (org.apache.spark.sql.SparkSession), StructType (org.apache.spark.sql.types.StructType), LinearRegressionModel (org.apache.spark.ml.regression.LinearRegressionModel), StructField (org.apache.spark.sql.types.StructField), VectorBuilder (net.jgp.labs.spark.x.udf.VectorBuilder), Row (org.apache.spark.sql.Row), LinearRegression (org.apache.spark.ml.regression.LinearRegression), Vector (org.apache.spark.ml.linalg.Vector), LinearRegressionTrainingSummary (org.apache.spark.ml.regression.LinearRegressionTrainingSummary)
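The vectorBuilder UDF registered at the top wraps the scalar valuefeatures column into an ML Vector, the type LinearRegression expects for its features column. The project's VectorBuilder is not shown on this page; a one-argument UDF with that behavior would look like this (a sketch, not necessarily the project's exact code):

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.api.java.UDF1;

// UDF1: one input column (the Double value), one output (a dense one-element vector).
class VectorBuilder implements UDF1<Double, Vector> {
    private static final long serialVersionUID = 1L;

    @Override
    public Vector call(Double value) throws Exception {
        return Vectors.dense(value);
    }
}

The VectorUDT passed to spark.udf().register(...) declares the UDF's return type to Spark SQL, since vectors are not a built-in SQL type.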

Example 64 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

The class AuthorsAndBooksCountBooksApp, method start().

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Authors and Books").master("local").getOrCreate();

    String filename = "data/authors.csv";
    Dataset<Row> authorsDf = spark.read().format("csv")
        .option("inferSchema", "true").option("header", "true").load(filename);
    authorsDf.show();

    filename = "data/books.csv";
    Dataset<Row> booksDf = spark.read().format("csv")
        .option("inferSchema", "true").option("header", "true").load(filename);
    booksDf.show();

    // Left join keeps every author, then count the joined rows per author.
    Dataset<Row> libraryDf = authorsDf
        .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "left")
        .withColumn("bookId", booksDf.col("id"))
        .drop(booksDf.col("id"))
        .groupBy(authorsDf.col("id"), authorsDf.col("name"), authorsDf.col("link"))
        .count();
    libraryDf.show();
    libraryDf.printSchema();
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
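A subtlety in this example: count() after groupBy counts joined rows, so an author with no books still shows a count of 1, because the left join emits one row (with a null bookId) for that author. If the goal is the number of books, a variant using the null-skipping count(Column) aggregate fixes this (a sketch building on the dataframes above):

import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;

// count(col) ignores nulls, so bookless authors correctly report 0.
Dataset<Row> bookCountsDf = authorsDf
    .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "left")
    .withColumn("bookId", booksDf.col("id"))
    .drop(booksDf.col("id"))
    .groupBy(authorsDf.col("id"), authorsDf.col("name"), authorsDf.col("link"))
    .agg(count(col("bookId")).alias("bookCount"));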

Example 65 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

The class BookUrlBuilderApp, method start().

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("Book URL Builder").master("local").getOrCreate();
    String filename = "data/books.csv";
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true").option("header", "true").load(filename);
    df.show();
    // Map each Row to a String; show(20, 80) prints up to 20 rows, truncated at 80 chars.
    Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
    ds.printSchema();
    ds.show(20, 80);
}
Also used: SparkSession (org.apache.spark.sql.SparkSession), Row (org.apache.spark.sql.Row)
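BookUrlBuilder is another MapFunction, this time from Row to String. Its implementation is not on this page; a minimal sketch, assuming the books file carries title and link columns (assumed names, for illustration only):

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

class BookUrlBuilder implements MapFunction<Row, String> {
    private static final long serialVersionUID = 1L;

    @Override
    public String call(Row row) throws Exception {
        // Combine the assumed columns into one display string per book.
        String title = row.getString(row.fieldIndex("title"));
        String link = row.getString(row.fieldIndex("link"));
        return title + " -> " + link;
    }
}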

Aggregations

Row (org.apache.spark.sql.Row): 129
Test (org.junit.Test): 60
Script (org.apache.sysml.api.mlcontext.Script): 53
StructType (org.apache.spark.sql.types.StructType): 50
ArrayList (java.util.ArrayList): 48
StructField (org.apache.spark.sql.types.StructField): 46
SparkSession (org.apache.spark.sql.SparkSession): 43
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 19
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 19
MLResults (org.apache.sysml.api.mlcontext.MLResults): 18
DenseVector (org.apache.spark.ml.linalg.DenseVector): 16
Vector (org.apache.spark.ml.linalg.Vector): 16
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 15
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 12
SQLContext (org.apache.spark.sql.SQLContext): 12
User (uk.gov.gchq.gaffer.user.User): 12
HashSet (java.util.HashSet): 10
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 9
Tuple2 (scala.Tuple2): 9
GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements): 9