Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin —
class CsvToDatasetBookAsJson, method start():
private void start() {
    // Spin up a local Spark session for this example.
    SparkSession spark = SparkSession.builder()
        .appName("CSV to Dataset<Book> as JSON")
        .master("local")
        .getOrCreate();

    // Ingest the CSV file, letting Spark infer the schema from the header row.
    String filename = "data/books.csv";
    Dataset<Row> csvDf = spark.read()
        .format("csv")
        .option("inferSchema", "true")
        .option("header", "true")
        .load(filename);
    csvDf.show();

    // Map every row to a JSON string using the project's BookMapper.
    Dataset<String> jsonStringDf = csvDf.map(new BookMapper(), Encoders.STRING());
    jsonStringDf.show(20, 132);

    // Re-parse the JSON strings back into a structured dataframe.
    Dataset<Row> bookAsJsonDf = spark.read().json(jsonStringDf);
    bookAsJsonDf.show();
}
Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin —
class BuildDataFrameFromScratch2, method start():
/**
 * Builds a small DataFrame from an in-memory list of String arrays.
 *
 * Fixes versus the original: the redundant trailing {@code .toDF()} call is
 * dropped ({@code createDataFrame} already returns a {@code Dataset<Row>}),
 * the deprecated {@code spark.sqlContext()} indirection is replaced with
 * {@code spark.createDataFrame(...)} directly, and the row-mapping lambda is
 * replaced with the equivalent method reference {@code RowFactory::create}.
 */
private void start() {
    // Local session using all available cores.
    SparkSession spark = SparkSession.builder()
        .appName("Build a DataFrame from Scratch")
        .master("local[*]")
        .getOrCreate();

    // Raw in-memory data: each String[] becomes one row.
    List<String[]> stringAsList = new ArrayList<>();
    stringAsList.add(new String[] { "bar1.1", "bar2.1" });
    stringAsList.add(new String[] { "bar1.2", "bar2.2" });

    JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());

    // RowFactory.create(Object...) accepts the String[] as its varargs array.
    JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map(RowFactory::create);

    // Creates schema: two non-nullable string columns.
    StructType schema = DataTypes.createStructType(new StructField[] {
        DataTypes.createStructField("foe1", DataTypes.StringType, false),
        DataTypes.createStructField("foe2", DataTypes.StringType, false) });

    Dataset<Row> df = spark.createDataFrame(rowRDD, schema);

    log.debug("** Schema: ");
    df.printSchema();
    log.debug("** Data: ");
    df.show();

    sparkContext.close();
}
Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin —
class Reader, method start():
/**
 * Creates a Spark session and reads rows from the pre-registered view
 * {@code myView}.
 *
 * Fix versus the original: the {@code JavaSparkContext sc} local was created
 * but never used and never closed — it is removed; the SparkSession builder
 * establishes the Spark context itself from the same configuration.
 */
private void start() {
    SparkConf conf = new SparkConf().setAppName("Concurrency Lab 001").setMaster(Config.MASTER);
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();

    // Re-read the effective configuration from the live context.
    conf = spark.sparkContext().conf();
    // NOTE(review): SparkConf.get(key) throws NoSuchElementException if "hello"
    // was never set — presumably a sibling writer sets it; verify against caller.
    System.out.println(conf.get("hello"));

    // Requires "myView" to have been registered as a temp view elsewhere.
    Dataset<Row> df = spark.sql("SELECT * from myView");
    df.show();
}
Use of org.apache.spark.sql.Row in project incubator-systemml by apache —
class MLContextUtil, method convertInputType():
/**
 * Convert input types to internal SystemML representations.
 *
 * <p>Dispatches on the runtime type of {@code parameterValue}: JavaRDD/RDD of
 * strings (CSV or IJV text), MatrixBlock/FrameBlock, Spark DataFrame,
 * Matrix/Frame wrappers, double[][], URL, and scalar boxes. Returns
 * {@code null} for unrecognized types, and also for an RDD/DataFrame whose
 * metadata is present but matches neither MatrixMetadata nor FrameMetadata.
 *
 * @param parameterName
 *            The name of the input parameter
 * @param parameterValue
 *            The value of the input parameter
 * @param metadata
 *            matrix/frame metadata
 * @return input in SystemML data representation
 */
public static Data convertInputType(String parameterName, Object parameterValue, Metadata metadata) {
	String name = parameterName;
	Object value = parameterValue;
	// Simplified from '(cond) ? true : false' — the conditions are already booleans.
	boolean hasMetadata = (metadata != null);
	boolean hasMatrixMetadata = hasMetadata && (metadata instanceof MatrixMetadata);
	boolean hasFrameMetadata = hasMetadata && (metadata instanceof FrameMetadata);
	if (name == null) {
		throw new MLContextException("Input parameter name is null");
	} else if (value == null) {
		throw new MLContextException("Input parameter value is null for: " + parameterName);
	} else if (value instanceof JavaRDD<?>) {
		// JavaRDD<String>: metadata decides matrix vs. frame and IJV vs. CSV;
		// without metadata, sniff the first line to guess matrix (all numbers) vs. frame.
		@SuppressWarnings("unchecked") JavaRDD<String> javaRDD = (JavaRDD<String>) value;
		if (hasMatrixMetadata) {
			MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
			if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
				return MLContextConversionUtil.javaRDDStringIJVToMatrixObject(javaRDD, matrixMetadata);
			} else {
				return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(javaRDD, matrixMetadata);
			}
		} else if (hasFrameMetadata) {
			FrameMetadata frameMetadata = (FrameMetadata) metadata;
			if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
				return MLContextConversionUtil.javaRDDStringIJVToFrameObject(javaRDD, frameMetadata);
			} else {
				return MLContextConversionUtil.javaRDDStringCSVToFrameObject(javaRDD, frameMetadata);
			}
		} else if (!hasMetadata) {
			String firstLine = javaRDD.first();
			boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
			if (isAllNumbers) {
				return MLContextConversionUtil.javaRDDStringCSVToMatrixObject(javaRDD);
			} else {
				return MLContextConversionUtil.javaRDDStringCSVToFrameObject(javaRDD);
			}
		}
	} else if (value instanceof RDD<?>) {
		// Scala RDD<String>: mirrors the JavaRDD branch above.
		@SuppressWarnings("unchecked") RDD<String> rdd = (RDD<String>) value;
		if (hasMatrixMetadata) {
			MatrixMetadata matrixMetadata = (MatrixMetadata) metadata;
			if (matrixMetadata.getMatrixFormat() == MatrixFormat.IJV) {
				return MLContextConversionUtil.rddStringIJVToMatrixObject(rdd, matrixMetadata);
			} else {
				return MLContextConversionUtil.rddStringCSVToMatrixObject(rdd, matrixMetadata);
			}
		} else if (hasFrameMetadata) {
			FrameMetadata frameMetadata = (FrameMetadata) metadata;
			if (frameMetadata.getFrameFormat() == FrameFormat.IJV) {
				return MLContextConversionUtil.rddStringIJVToFrameObject(rdd, frameMetadata);
			} else {
				return MLContextConversionUtil.rddStringCSVToFrameObject(rdd, frameMetadata);
			}
		} else if (!hasMetadata) {
			String firstLine = rdd.first();
			boolean isAllNumbers = isCSVLineAllNumbers(firstLine);
			if (isAllNumbers) {
				return MLContextConversionUtil.rddStringCSVToMatrixObject(rdd);
			} else {
				return MLContextConversionUtil.rddStringCSVToFrameObject(rdd);
			}
		}
	} else if (value instanceof MatrixBlock) {
		MatrixBlock matrixBlock = (MatrixBlock) value;
		return MLContextConversionUtil.matrixBlockToMatrixObject(name, matrixBlock, (MatrixMetadata) metadata);
	} else if (value instanceof FrameBlock) {
		FrameBlock frameBlock = (FrameBlock) value;
		return MLContextConversionUtil.frameBlockToFrameObject(name, frameBlock, (FrameMetadata) metadata);
	} else if (value instanceof Dataset<?>) {
		// DataFrame: normalize MLlib vector columns to spark.ml first, then
		// dispatch on metadata; without metadata, inspect the schema.
		@SuppressWarnings("unchecked") Dataset<Row> dataFrame = (Dataset<Row>) value;
		dataFrame = MLUtils.convertVectorColumnsToML(dataFrame);
		if (hasMatrixMetadata) {
			return MLContextConversionUtil.dataFrameToMatrixObject(dataFrame, (MatrixMetadata) metadata);
		} else if (hasFrameMetadata) {
			return MLContextConversionUtil.dataFrameToFrameObject(dataFrame, (FrameMetadata) metadata);
		} else if (!hasMetadata) {
			boolean looksLikeMatrix = doesDataFrameLookLikeMatrix(dataFrame);
			if (looksLikeMatrix) {
				return MLContextConversionUtil.dataFrameToMatrixObject(dataFrame);
			} else {
				return MLContextConversionUtil.dataFrameToFrameObject(dataFrame);
			}
		}
	} else if (value instanceof Matrix) {
		// Matrix wrapper: materialize from binary blocks only when no
		// MatrixObject is cached; fall back to the wrapper's own metadata.
		Matrix matrix = (Matrix) value;
		if ((matrix.hasBinaryBlocks()) && (!matrix.hasMatrixObject())) {
			if (metadata == null) {
				metadata = matrix.getMatrixMetadata();
			}
			JavaPairRDD<MatrixIndexes, MatrixBlock> binaryBlocks = matrix.toBinaryBlocks();
			return MLContextConversionUtil.binaryBlocksToMatrixObject(binaryBlocks, (MatrixMetadata) metadata);
		} else {
			return matrix.toMatrixObject();
		}
	} else if (value instanceof Frame) {
		// Frame wrapper: symmetric to the Matrix branch above.
		Frame frame = (Frame) value;
		if ((frame.hasBinaryBlocks()) && (!frame.hasFrameObject())) {
			if (metadata == null) {
				metadata = frame.getFrameMetadata();
			}
			JavaPairRDD<Long, FrameBlock> binaryBlocks = frame.toBinaryBlocks();
			return MLContextConversionUtil.binaryBlocksToFrameObject(binaryBlocks, (FrameMetadata) metadata);
		} else {
			return frame.toFrameObject();
		}
	} else if (value instanceof double[][]) {
		double[][] doubleMatrix = (double[][]) value;
		return MLContextConversionUtil.doubleMatrixToMatrixObject(name, doubleMatrix, (MatrixMetadata) metadata);
	} else if (value instanceof URL) {
		URL url = (URL) value;
		return MLContextConversionUtil.urlToMatrixObject(url, (MatrixMetadata) metadata);
	} else if (value instanceof Integer) {
		return new IntObject((Integer) value);
	} else if (value instanceof Double) {
		return new DoubleObject((Double) value);
	} else if (value instanceof String) {
		return new StringObject((String) value);
	} else if (value instanceof Boolean) {
		return new BooleanObject((Boolean) value);
	}
	// Unrecognized type (or RDD/DataFrame with non-matrix/non-frame metadata).
	return null;
}
Use of org.apache.spark.sql.Row in project incubator-systemml by apache —
class DataFrameMatrixConversionTest, method testDataFrameConversion():
/**
 * Round-trips a random matrix through a DataFrame and back to binary blocks,
 * then compares the result against the original data.
 *
 * @param vector      whether the DataFrame uses a vector column layout
 * @param cols        number of columns (also selects the row count)
 * @param dense       true for the dense sparsity setting, false for sparse
 * @param unknownDims true to pass empty (unknown) output characteristics
 */
private void testDataFrameConversion(boolean vector, int cols, boolean dense, boolean unknownDims) {
	// Remember global DMLScript state so it can be restored afterwards.
	boolean prevLocalSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
	RUNTIME_PLATFORM prevPlatform = DMLScript.rtplatform;
	try {
		DMLScript.USE_LOCAL_SPARK_CONFIG = true;
		DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;

		// generate input data and setup metadata
		int rows = (cols == cols3) ? rows3 : rows1;
		double sparsity = dense ? sparsity1 : sparsity2;
		double[][] input = getRandomMatrix(rows, cols, -10, 10, sparsity, 2373);
		MatrixBlock inputBlock = DataConverter.convertToMatrixBlock(input);
		int blksz = ConfigurationManager.getBlocksize();
		MatrixCharacteristics mcIn = new MatrixCharacteristics(rows, cols, blksz, blksz, inputBlock.getNonZeros());
		MatrixCharacteristics mcOut = unknownDims ? new MatrixCharacteristics() : new MatrixCharacteristics(mcIn);

		// get binary block input rdd
		JavaPairRDD<MatrixIndexes, MatrixBlock> binIn = SparkExecutionContext.toMatrixJavaPairRDD(sc, inputBlock, blksz, blksz);

		// matrix - dataframe - matrix conversion
		Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, binIn, mcIn, vector);
		if (rows == rows3) {
			df = df.repartition(rows);
		}
		JavaPairRDD<MatrixIndexes, MatrixBlock> binOut = RDDConverterUtils.dataFrameToBinaryBlock(sc, df, mcOut, true, vector);

		// get output matrix block
		MatrixBlock outputBlock = SparkExecutionContext.toMatrixBlock(binOut, rows, cols, blksz, blksz, -1);

		// compare matrix blocks
		double[][] output = DataConverter.convertToDoubleMatrix(outputBlock);
		TestUtils.compareMatrices(input, output, rows, cols, eps);
	} catch (Exception ex) {
		throw new RuntimeException(ex);
	} finally {
		// Always restore the global flags, even on failure.
		DMLScript.USE_LOCAL_SPARK_CONFIG = prevLocalSparkConfig;
		DMLScript.rtplatform = prevPlatform;
	}
}
Aggregations