use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.
the class MLContextFrameTest method testInputFrameAndMatrixOutputMatrix.
/**
 * Passes a two-column frame (string, double) and a one-column matrix to a DML
 * script as DataFrames, together with a scalar, and checks the resulting
 * matrix M cell by cell.
 */
@Test
public void testInputFrameAndMatrixOutputMatrix() {
    System.out.println("MLContextFrameTest - input frame and matrix, output matrix");

    // Frame input A: three CSV rows with a string column and a double column.
    List<String> csvRowsA = new ArrayList<>();
    csvRowsA.add("Test1,4.0");
    csvRowsA.add("Test2,5.0");
    csvRowsA.add("Test3,6.0");
    JavaRDD<String> stringRddA = sc.parallelize(csvRowsA);

    // Matrix input B: two rows holding a single double each.
    List<String> csvRowsB = new ArrayList<>();
    csvRowsB.add("1.0");
    csvRowsB.add("2.0");
    JavaRDD<String> stringRddB = sc.parallelize(csvRowsB);

    ValueType[] valueTypesA = { ValueType.STRING, ValueType.DOUBLE };
    JavaRDD<Row> rowRddA = FrameRDDConverterUtils.csvToRowRDD(sc, stringRddA, CSV_DELIM, valueTypesA);
    JavaRDD<Row> rowRddB = stringRddB.map(new CommaSeparatedValueStringToDoubleArrayRow());

    // DataFrame schema for A: columns "1" (string) and "2" (double).
    List<StructField> structFieldsA = new ArrayList<>();
    structFieldsA.add(DataTypes.createStructField("1", DataTypes.StringType, true));
    structFieldsA.add(DataTypes.createStructField("2", DataTypes.DoubleType, true));
    Dataset<Row> frameA = spark.createDataFrame(rowRddA, DataTypes.createStructType(structFieldsA));

    // DataFrame schema for B: a single double column "1".
    List<StructField> structFieldsB = new ArrayList<>();
    structFieldsB.add(DataTypes.createStructField("1", DataTypes.DoubleType, true));
    Dataset<Row> frameB = spark.createDataFrame(rowRddB, DataTypes.createStructType(structFieldsB));

    // The script encodes A, multiplies the encoded frame by B and scales by s.
    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: true ,recode: [ 1, 2 ]}\");\n"
            + "C = tA %*% B;\n"
            + "M = s * C;";

    // Dimensions handed to the script are taken from the DataFrames themselves.
    FrameMetadata metadataA = new FrameMetadata(FrameFormat.CSV, frameA.count(), (long) frameA.columns().length);
    MatrixMetadata metadataB = new MatrixMetadata(MatrixFormat.CSV, frameB.count(), (long) frameB.columns().length);

    Script script = dml(dmlString)
            .in("A", frameA, metadataA)
            .in("B", frameB, metadataB)
            .in("s", 2)
            .out("M");
    MLResults results = ml.execute(script);

    double[][] matrix = results.getMatrixAs2DDoubleArray("M");
    double[] expectedFirstColumn = { 6.0, 12.0, 18.0 };
    for (int row = 0; row < expectedFirstColumn.length; row++) {
        Assert.assertEquals(expectedFirstColumn[row], matrix[row][0], 0.0);
    }
}
use of org.apache.spark.sql.types.StructField in project net.jgp.labs.spark by jgperrin.
the class StreamingIngestionFileSystemTextFileToDataframeApp method start.
/**
 * Runs a local streaming job: text read through
 * {@code textFileStream(StreamingUtils.getInputDirectory())} is printed and,
 * for every batch, converted to a single-column ("Message") dataframe that is
 * shown on the console. The call blocks in {@code awaitTermination()}; if the
 * waiting thread is interrupted, its interrupt status is restored before
 * returning.
 */
private void start() {
    // Create a local StreamingContext with two worker threads and a batch
    // interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();

    // For each batch: wrap every line in a Row and display it as a dataframe
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            // Create JavaRDD<Row>: one single-field row per message
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    return RowFactory.create(msg);
                }
            });

            // Create Schema
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // Catching InterruptedException clears the thread's interrupt flag;
        // set it again so code further up the stack can still observe the
        // interruption instead of it being silently lost.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}
use of org.apache.spark.sql.types.StructField in project net.jgp.labs.spark by jgperrin.
the class SimplePredictionFromTextFile method start.
/**
 * Loads a two-column CSV file, derives a label column and a vector feature
 * column from it, fits a linear regression and prints the model, its training
 * summary and a sample prediction to the console.
 */
private void start() {
    SparkSession spark = SparkSession.builder()
            .appName("Simple prediction from Text File")
            .master("local")
            .getOrCreate();
    spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());

    // Schema applied to the CSV reader. The "features" column declared here is
    // replaced further down by the output of the vectorBuilder UDF.
    StructField[] columns = new StructField[] {
            new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
            new StructField("features", new VectorUDT(), true, Metadata.empty()) };
    StructType schema = new StructType(columns);

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> data = spark.read()
            .format("csv")
            .schema(schema)
            .option("header", "false")
            .load(filename);

    // Rename _c0 -> valuefeatures and _c1 -> label by copying, then dropping.
    data = data.withColumn("valuefeatures", data.col("_c0")).drop("_c0");
    data = data.withColumn("label", data.col("_c1")).drop("_c1");
    data.printSchema();

    // Build the vector column from the raw value via the registered UDF.
    data = data.withColumn("features", callUDF("vectorBuilder", data.col("valuefeatures")));
    data.printSchema();
    data.show();

    // Only the iteration cap is configured; the regularization and elastic-net
    // parameters are not set here.
    LinearRegression regression = new LinearRegression().setMaxIter(20);

    // Train on the prepared data.
    LinearRegressionModel model = regression.fit(data);

    // Apply the fitted model back to the same data and display the output.
    model.transform(data).show();

    LinearRegressionTrainingSummary summary = model.summary();
    System.out.println("numIterations: " + summary.totalIterations());
    System.out.println("objectiveHistory: " + Vectors.dense(summary.objectiveHistory()));
    summary.residuals().show();
    System.out.println("RMSE: " + summary.rootMeanSquaredError());
    System.out.println("r2: " + summary.r2());

    double intercept = model.intercept();
    // NOTE(review): the label below is misspelled in the program output; it is
    // reproduced unchanged so the console output stays the same.
    System.out.println("Interesection: " + intercept);
    double regParam = model.getRegParam();
    System.out.println("Regression parameter: " + regParam);
    System.out.println("Tol: " + model.getTol());

    // Predict for a single hand-picked feature value.
    Double feature = 7.0;
    Vector features = Vectors.dense(feature);
    double prediction = model.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + prediction);

    System.out.println(8 * regParam + intercept);
}
use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.
the class MLContextUtil method doesDataFrameLookLikeMatrix.
/**
 * Inspect the schema of a DataFrame and report whether every column has a
 * type that is accepted as matrix content: double, integer, long, or one of
 * the two Spark vector UDTs (ml and mllib).
 *
 * @param df
 *            the DataFrame
 * @return {@code true} if all columns have an accepted type (or the schema
 *         exposes no field array), {@code false} as soon as one column has
 *         any other type
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
    StructField[] columns = df.schema().fields();
    if (columns == null) {
        return true;
    }
    for (int i = 0; i < columns.length; i++) {
        DataType type = columns[i].dataType();
        boolean numericScalar = (type == DataTypes.DoubleType)
                || (type == DataTypes.IntegerType)
                || (type == DataTypes.LongType);
        boolean vector = (type instanceof org.apache.spark.ml.linalg.VectorUDT)
                || (type instanceof org.apache.spark.mllib.linalg.VectorUDT);
        // A single column outside the accepted types rules out a matrix.
        if (!numericScalar && !vector) {
            return false;
        }
    }
    return true;
}
use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.
the class MLContextTest method testDataFrameSumPYDMLDoublesWithNoIDColumn.
/**
 * Feeds a 3x3 DataFrame of doubles (columns C1..C3, no ID column) to a PYDML
 * script that prints the sum of the matrix, with "sum: 450.0" registered as
 * the expected standard output.
 */
@Test
public void testDataFrameSumPYDMLDoublesWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, doubles with no ID column");

    List<String> csvRows = new ArrayList<>();
    csvRows.add("10,20,30");
    csvRows.add("40,50,60");
    csvRows.add("70,80,90");
    JavaRDD<Row> rowRdd = sc.parallelize(csvRows).map(new CommaSeparatedValueStringToDoubleArrayRow());

    // Three nullable double columns named C1, C2 and C3.
    List<StructField> columns = new ArrayList<>();
    for (int col = 1; col <= 3; col++) {
        columns.add(DataTypes.createStructField("C" + col, DataTypes.DoubleType, true));
    }
    Dataset<Row> dataFrame = spark.createDataFrame(rowRdd, DataTypes.createStructType(columns));

    MatrixMetadata metadata = new MatrixMetadata(MatrixFormat.DF_DOUBLES);
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, metadata);
    setExpectedStdOut("sum: 450.0");
    ml.execute(script);
}
Aggregations