Example 31 with StructType

Use of org.apache.spark.sql.types.StructType in the project incubator-systemml by apache.

The class MLContextFrameTest, method testInputFrameAndMatrixOutputMatrix:

@Test
public void testInputFrameAndMatrixOutputMatrix() {
    System.out.println("MLContextFrameTest - input frame and matrix, output matrix");
    List<String> dataA = new ArrayList<String>();
    dataA.add("Test1,4.0");
    dataA.add("Test2,5.0");
    dataA.add("Test3,6.0");
    JavaRDD<String> javaRddStringA = sc.parallelize(dataA);
    ValueType[] schema = { ValueType.STRING, ValueType.DOUBLE };
    List<String> dataB = new ArrayList<String>();
    dataB.add("1.0");
    dataB.add("2.0");
    JavaRDD<String> javaRddStringB = sc.parallelize(dataB);
    JavaRDD<Row> javaRddRowA = FrameRDDConverterUtils.csvToRowRDD(sc, javaRddStringA, CSV_DELIM, schema);
    JavaRDD<Row> javaRddRowB = javaRddStringB.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fieldsA = new ArrayList<StructField>();
    fieldsA.add(DataTypes.createStructField("1", DataTypes.StringType, true));
    fieldsA.add(DataTypes.createStructField("2", DataTypes.DoubleType, true));
    StructType schemaA = DataTypes.createStructType(fieldsA);
    Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, schemaA);
    List<StructField> fieldsB = new ArrayList<StructField>();
    fieldsB.add(DataTypes.createStructField("1", DataTypes.DoubleType, true));
    StructType schemaB = DataTypes.createStructType(fieldsB);
    Dataset<Row> dataFrameB = spark.createDataFrame(javaRddRowB, schemaB);
    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: true ,recode: [ 1, 2 ]}\");\n"
            + "C = tA %*% B;\n"
            + "M = s * C;";
    Script script = dml(dmlString)
            .in("A", dataFrameA, new FrameMetadata(FrameFormat.CSV, dataFrameA.count(), (long) dataFrameA.columns().length))
            .in("B", dataFrameB, new MatrixMetadata(MatrixFormat.CSV, dataFrameB.count(), (long) dataFrameB.columns().length))
            .in("s", 2)
            .out("M");
    MLResults results = ml.execute(script);
    double[][] matrix = results.getMatrixAs2DDoubleArray("M");
    Assert.assertEquals(6.0, matrix[0][0], 0.0);
    Assert.assertEquals(12.0, matrix[1][0], 0.0);
    Assert.assertEquals(18.0, matrix[2][0], 0.0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata) Test(org.junit.Test)
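
Why the assertions hold, assuming recode assigns codes in order of first appearance: transformencode recodes both columns of A, so tA = [[1, 1], [2, 2], [3, 3]]; B is the 2x1 matrix [[1.0], [2.0]], so C = tA %*% B = [[3.0], [6.0], [9.0]], and M = s * C = 2 * C = [[6.0], [12.0], [18.0]], which is exactly what the three assertEquals calls check.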

Example 32 with StructType

Use of org.apache.spark.sql.types.StructType in the project net.jgp.labs.spark by jgperrin.

The class StreamingIngestionFileSystemTextFileToDataframeApp, method start:

private void start() {
    // Create a local StreamingContext with two working threads and a batch
    // interval of 5 seconds (matching Durations.seconds(5) below)
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {

                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    Row row = RowFactory.create(msg);
                    return row;
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        e.printStackTrace();
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
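
The snippet calls JavaSparkSessionSingleton.getInstance but does not define it; a minimal sketch, assuming it follows the lazily initialized singleton holder used in the official Spark streaming examples:

class JavaSparkSessionSingleton {

    private static transient SparkSession instance = null;

    // Create the SparkSession once and reuse it across micro-batches,
    // so every foreachRDD invocation shares the same session
    public static SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession.builder().config(sparkConf).getOrCreate();
        }
        return instance;
    }
}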

Example 33 with StructType

Use of org.apache.spark.sql.types.StructType in the project net.jgp.labs.spark by jgperrin.

The class SimplePredictionFromTextFile, method start:

private void start() {
    SparkSession spark = SparkSession.builder().appName("Simple prediction from Text File").master("local").getOrCreate();
    spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());
    String filename = "data/tuple-data-file.csv";
    StructType schema = new StructType(new StructField[] { new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()), new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()), new StructField("features", new VectorUDT(), true, Metadata.empty()) });
    Dataset<Row> df = spark.read().format("csv").schema(schema).option("header", "false").load(filename);
    df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
    df = df.withColumn("label", df.col("_c1")).drop("_c1");
    df.printSchema();
    df = df.withColumn("features", callUDF("vectorBuilder", df.col("valuefeatures")));
    df.printSchema();
    df.show();
    // Optionally add regularization: .setRegParam(1).setElasticNetParam(1)
    LinearRegression lr = new LinearRegression().setMaxIter(20);
    // Fit the model to the data.
    LinearRegressionModel model = lr.fit(df);
    // Given a dataset, predict each point's label, and show the results.
    model.transform(df).show();
    LinearRegressionTrainingSummary trainingSummary = model.summary();
    System.out.println("numIterations: " + trainingSummary.totalIterations());
    System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
    trainingSummary.residuals().show();
    System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
    System.out.println("r2: " + trainingSummary.r2());
    double intercept = model.intercept();
    System.out.println("Interesection: " + intercept);
    double regParam = model.getRegParam();
    System.out.println("Regression parameter: " + regParam);
    double tol = model.getTol();
    System.out.println("Tol: " + tol);
    Double feature = 7.0;
    Vector features = Vectors.dense(feature);
    double p = model.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p);
    // With the default regParam of 0.0, this just prints the intercept
    System.out.println(8 * regParam + intercept);
}
Also used : VectorUDT(org.apache.spark.ml.linalg.VectorUDT) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) LinearRegressionModel(org.apache.spark.ml.regression.LinearRegressionModel) StructField(org.apache.spark.sql.types.StructField) VectorBuilder(net.jgp.labs.spark.x.udf.VectorBuilder) Row(org.apache.spark.sql.Row) LinearRegression(org.apache.spark.ml.regression.LinearRegression) Vector(org.apache.spark.ml.linalg.Vector) LinearRegressionTrainingSummary(org.apache.spark.ml.regression.LinearRegressionTrainingSummary)
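
The VectorBuilder UDF registered as "vectorBuilder" (imported from net.jgp.labs.spark.x.udf.VectorBuilder) is not shown; a minimal sketch, assuming it does nothing more than wrap the scalar feature in a dense vector, which is what the "features" column and the VectorUDT return type require:

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.api.java.UDF1;

public class VectorBuilder implements UDF1<Double, Vector> {

    private static final long serialVersionUID = 1L;

    @Override
    public Vector call(Double value) {
        // A single scalar feature becomes a one-element dense vector
        return Vectors.dense(value);
    }
}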

Example 34 with StructType

Use of org.apache.spark.sql.types.StructType in the project net.jgp.labs.spark by jgperrin.

The class BuildDataFrameFromScratch2, method start:

private void start() {
    SparkSession spark = SparkSession.builder().appName("Build a DataFrame from Scratch").master("local[*]").getOrCreate();
    List<String[]> stringAsList = new ArrayList<>();
    stringAsList.add(new String[] { "bar1.1", "bar2.1" });
    stringAsList.add(new String[] { "bar1.2", "bar2.2" });
    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((String[] row) -> RowFactory.create(row));
    // Creates schema
    StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("foe1", DataTypes.StringType, false), DataTypes.createStructField("foe2", DataTypes.StringType, false) });
    Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();
    log.debug("** Schema: ");
    df.printSchema();
    log.debug("** Data: ");
    df.show();
    sparkContext.close();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row)
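
For two hard-coded rows, the detour through JavaSparkContext is optional; SparkSession.createDataFrame also accepts a local List<Row> plus a schema directly. A variant sketch reusing the schema defined above:

    List<Row> rows = new ArrayList<>();
    rows.add(RowFactory.create("bar1.1", "bar2.1"));
    rows.add(RowFactory.create("bar1.2", "bar2.2"));
    // No RDD needed: build the DataFrame straight from the local list
    Dataset<Row> df = spark.createDataFrame(rows, schema);
    df.show();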

Example 35 with StructType

Use of org.apache.spark.sql.types.StructType in the project incubator-systemml by apache.

The class MLContextConversionUtil, method determineFrameFormatIfNeeded:

/**
 * If the FrameFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper FrameFormat.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param frameMetadata
 *            the frame metadata, if available
 */
public static void determineFrameFormatIfNeeded(Dataset<Row> dataFrame, FrameMetadata frameMetadata) {
    FrameFormat frameFormat = frameMetadata.getFrameFormat();
    if (frameFormat != null) {
        return;
    }
    StructType schema = dataFrame.schema();
    boolean hasID = false;
    try {
        schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
        hasID = true;
    } catch (IllegalArgumentException iae) {
        // No ID column in the schema; keep hasID = false
    }
    FrameFormat ff = hasID ? FrameFormat.DF_WITH_INDEX : FrameFormat.DF;
    frameMetadata.setFrameFormat(ff);
}
Also used : StructType(org.apache.spark.sql.types.StructType)
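
A hypothetical call site for this utility (the no-arg FrameMetadata constructor is an assumption; in SystemML, RDDConverterUtils.DF_ID_COLUMN names the "__INDEX" row-index column):

    FrameMetadata meta = new FrameMetadata();
    // Sets DF_WITH_INDEX when the DataFrame carries the __INDEX column,
    // plain DF otherwise; an explicitly set format is left untouched
    MLContextConversionUtil.determineFrameFormatIfNeeded(dataFrame, meta);
    FrameFormat format = meta.getFrameFormat();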

Aggregations

StructType (org.apache.spark.sql.types.StructType) 56
Row (org.apache.spark.sql.Row) 49
StructField (org.apache.spark.sql.types.StructField) 48
ArrayList (java.util.ArrayList) 43
Test (org.junit.Test) 37
Script (org.apache.sysml.api.mlcontext.Script) 35
VectorUDT (org.apache.spark.ml.linalg.VectorUDT) 17
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata) 17
DenseVector (org.apache.spark.ml.linalg.DenseVector) 15
Vector (org.apache.spark.ml.linalg.Vector) 14
SparkSession (org.apache.spark.sql.SparkSession) 11
MLResults (org.apache.sysml.api.mlcontext.MLResults) 6
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock) 6
Tuple2 (scala.Tuple2) 6
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 5
FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata) 5
DataType (org.apache.spark.sql.types.DataType) 4
JavaRDD (org.apache.spark.api.java.JavaRDD) 3
ValueType (org.apache.sysml.parser.Expression.ValueType) 3
CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) 3