Search in sources :

Example 6 with SparkSession

use of org.apache.spark.sql.SparkSession in project incubator-systemml by apache.

the class MLContextParforDatasetTest method runMLContextParforDatasetTest.

/**
 * Executes a parfor DML script through MLContext with a DataFrame as input "X" and
 * compares the script output "r" against the sum of the generated matrix block.
 *
 * @param vector       passed to the binary-block-to-DataFrame conversion; also selects
 *                     DF_VECTOR_WITH_INDEX (true) or DF_DOUBLES_WITH_INDEX (false) as
 *                     the declared matrix format
 * @param unknownDims  if true, the metadata carries default-constructed matrix
 *                     characteristics instead of a copy of the actual ones
 * @param multiInputs  if true, runs the script variant that also reads Y = X, so the
 *                     expected sum is doubled
 */
private void runMLContextParforDatasetTest(boolean vector, boolean unknownDims, boolean multiInputs) {
    // Remember the current local memory budget so the finally block can put it back.
    final long savedMaxMemory = InfrastructureAnalyzer.getLocalMaxMemory();
    // Shrink the budget to 1MB to trigger fused datapartition-execute.
    InfrastructureAnalyzer.setLocalMaxMemory(1 * 1024 * 1024);
    try {
        double[][] data = getRandomMatrix(rows, cols, -10, 10, sparsity, 76543);
        MatrixBlock block = DataConverter.convertToMatrixBlock(data);
        int blockSize = ConfigurationManager.getBlocksize();

        // Actual characteristics are always used for the conversion; the declared
        // ones handed to MLContext may deliberately be left empty.
        MatrixCharacteristics actualMc = new MatrixCharacteristics(rows, cols, blockSize, blockSize, block.getNonZeros());
        MatrixCharacteristics declaredMc;
        if (unknownDims) {
            declaredMc = new MatrixCharacteristics();
        } else {
            declaredMc = new MatrixCharacteristics(actualMc);
        }

        // create input dataset
        SparkSession session = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
        JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = SparkExecutionContext.toMatrixJavaPairRDD(sc, block, blockSize, blockSize);
        Dataset<Row> frame = RDDConverterUtils.binaryBlockToDataFrame(session, blocks, actualMc, vector);

        MatrixFormat format;
        if (vector) {
            format = MatrixFormat.DF_VECTOR_WITH_INDEX;
        } else {
            format = MatrixFormat.DF_DOUBLES_WITH_INDEX;
        }
        MatrixMetadata metadata = new MatrixMetadata(format);
        metadata.setMatrixCharacteristics(declaredMc);

        // Pick the DML source: single-input or two-input (Y = X) row-sum parfor.
        String dmlSource;
        if (multiInputs) {
            dmlSource = "v = matrix(0, rows=nrow(X), cols=1)"
                + "Y = X;"
                + "parfor(i in 1:nrow(X), log=DEBUG) {"
                + "   v[i, ] = sum(X[i, ]+Y[i, ]);"
                + "}"
                + "r = sum(v);";
        } else {
            dmlSource = "v = matrix(0, rows=nrow(X), cols=1)"
                + "parfor(i in 1:nrow(X), log=DEBUG) {"
                + "   v[i, ] = sum(X[i, ]);"
                + "}"
                + "r = sum(v);";
        }

        ml.setExplain(true);
        ml.setExplainLevel(ExplainLevel.RUNTIME);
        ml.setStatistics(true);

        Script parforScript = dml(dmlSource).in("X", frame, metadata).out("r");
        MLResults output = ml.execute(parforScript);

        // compare aggregation results
        double actual = output.getDouble("r");
        final int factor = multiInputs ? 2 : 1;
        double expected = block.sum() * factor;
        TestUtils.compareScalars(expected, actual, 0.000001);
    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        // Always restore the memory budget captured on entry.
        InfrastructureAnalyzer.setLocalMaxMemory(savedMaxMemory);
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) SparkSession(org.apache.spark.sql.SparkSession) MLResults(org.apache.sysml.api.mlcontext.MLResults) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata)

Example 7 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class CsvToDatasetApp method start.

/**
 * Loads "data/tuple-data-file.csv" as a header-less CSV with schema inference
 * enabled and prints the resulting rows via {@code show()}.
 */
private void start() {
    final String csvPath = "data/tuple-data-file.csv";

    SparkSession session = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    Dataset<Row> rows = session.read()
        .format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .load(csvPath);

    rows.show();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 8 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class JsonArrayToDataset method start.

/**
 * Reads "data/array.json", reports how long the read call took, prints the
 * frame and its schema, then explodes the "valsInArrays" column into a
 * single column named "vals" and prints that frame and schema as well.
 */
private void start() {
    SparkSession session = SparkSession.builder()
        .appName("JSON array to Dataset")
        .master("local")
        .getOrCreate();
    final String jsonPath = "data/array.json";

    // Only the json(...) call itself is timed.
    long begin = System.currentTimeMillis();
    Dataset<Row> raw = session.read().json(jsonPath);
    long end = System.currentTimeMillis();
    long elapsedMs = end - begin;
    System.out.println("Processing took " + elapsedMs + " ms");

    raw.show();
    raw.printSchema();

    // Turns the "one liner" into a real column
    Dataset<Row> exploded = raw
        .select(explode(raw.col("valsInArrays")))
        .toDF("vals");
    exploded.show();
    exploded.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 9 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class JsonMapToDataset method start.

/**
 * Reads "data/map.json", reports how long the read call took, and prints
 * the resulting frame followed by its schema.
 */
private void start() {
    SparkSession session = SparkSession.builder()
        .appName("JSON map to Dataset")
        .master("local")
        .getOrCreate();
    final String jsonPath = "data/map.json";

    // Only the json(...) call itself is timed.
    long begin = System.currentTimeMillis();
    Dataset<Row> parsed = session.read().json(jsonPath);
    long end = System.currentTimeMillis();
    long elapsedMs = end - begin;
    System.out.println("Processing took " + elapsedMs + " ms");

    parsed.show();
    parsed.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 10 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class QuotedCsvWithHeaderToDataset method start.

/**
 * Loads "data/csv-quoted.txt" as CSV with a header row and schema inference
 * enabled, then prints the rows and the resulting schema.
 */
private void start() {
    final String csvPath = "data/csv-quoted.txt";

    SparkSession session = SparkSession.builder()
        .appName("CSV to Dataset")
        .master("local")
        .getOrCreate();

    Dataset<Row> rows = session.read()
        .option("inferSchema", "true")
        .option("header", "true")
        .csv(csvPath);

    rows.show();
    rows.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Aggregations

SparkSession (org.apache.spark.sql.SparkSession): 53, Row (org.apache.spark.sql.Row): 43, StructType (org.apache.spark.sql.types.StructType): 11, ArrayList (java.util.ArrayList): 6, JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 6, StructField (org.apache.spark.sql.types.StructField): 6, SparkConf (org.apache.spark.SparkConf): 4, JavaRDD (org.apache.spark.api.java.JavaRDD): 3, Script (org.apache.sysml.api.mlcontext.Script): 3, Test (org.junit.Test): 3, Dataset (org.apache.spark.sql.Dataset): 2, StreamingQuery (org.apache.spark.sql.streaming.StreamingQuery): 2, StreamingQueryException (org.apache.spark.sql.streaming.StreamingQueryException): 2, DMLScript (org.apache.sysml.api.DMLScript): 2, RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 2, MLContext (org.apache.sysml.api.mlcontext.MLContext): 2, Matrix (org.apache.sysml.api.mlcontext.Matrix): 2, MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 2, MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 2, File (java.io.File): 1