
Example 26 with SparkSession

Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

The class PiApp, method start().

private void start() {
    // Session against a standalone cluster; the addresses and jar path below
    // are specific to the author's lab environment.
    SparkSession spark = SparkSession.builder()
        .appName("JavaSparkPi")
        .master("spark://10.0.100.81:7077")
        .config("spark.executor.memory", "1g")
        .config("spark.executor.cores", "1")
        .config("spark.cores.max", "2")
        .config("spark.driver.host", "10.0.100.182")
        .config("spark.executor.extraClassPath", "/home/jgp/net.jgp.labs.spark/target/labs-spark-2.2.0-jar-with-dependencies.jar")
        .getOrCreate();
    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
    int n = 1 * NUM_SAMPLES;
    List<Integer> l = new ArrayList<>(n);
    for (int i = 0; i < n; i++) {
        l.add(i);
    }
    JavaRDD<Integer> dataSet = jsc.parallelize(l, NUM_SAMPLES);
    long t0 = System.currentTimeMillis();
    // Each element draws a uniform point in [-1, 1] x [-1, 1] and maps to 1
    // when the point falls inside the unit circle, 0 otherwise.
    long count = dataSet.map(integer -> {
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        return (x * x + y * y <= 1) ? 1 : 0;
    }).reduce((integer, integer2) -> integer + integer2);
    long t1 = System.currentTimeMillis();
    log.info("Pi is roughly ..... {}", 4.0 * count / n);
    log.info("Processing time ... {} ms", t1 - t0);
    spark.stop();
}
Also used : List(java.util.List) Logger(org.slf4j.Logger) SubStringCounterDataSource(net.jgp.labs.spark.x.datasource.SubStringCounterDataSource) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) JavaRDD(org.apache.spark.api.java.JavaRDD) ArrayList(java.util.ArrayList) SparkSession(org.apache.spark.sql.SparkSession)
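
Since count / n approximates the area ratio of the unit circle to the enclosing square, pi / 4, multiplying by 4 yields the estimate logged above. A minimal, Spark-free sketch of the same Monte Carlo estimate, useful as a sanity check (this class is illustrative and not part of the project):

public class PiEstimateSketch {
    public static void main(String[] args) {
        int n = 1_000_000;
        long count = 0;
        for (int i = 0; i < n; i++) {
            // Same draw as the Spark example: a uniform point in [-1, 1] x [-1, 1].
            double x = java.util.concurrent.ThreadLocalRandom.current().nextDouble(-1, 1);
            double y = java.util.concurrent.ThreadLocalRandom.current().nextDouble(-1, 1);
            if (x * x + y * y <= 1) {
                count++;
            }
        }
        // count / n approximates pi / 4, hence the factor of 4.
        System.out.println("Pi is roughly " + 4.0 * count / n);
    }
}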

Example 27 with SparkSession

Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

The class Reader, method start().

private void start() {
    SparkConf conf = new SparkConf().setAppName("Concurrency Lab 001").setMaster(Config.MASTER);
    JavaSparkContext sc = new JavaSparkContext(conf);
    // getOrCreate() reuses the SparkContext created just above from the same conf.
    SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
    conf = spark.sparkContext().conf();
    // SparkConf.get(key) throws NoSuchElementException when the key is unset;
    // the two-argument form supplies a fallback instead.
    System.out.println(conf.get("hello", "<not set>"));
    // Assumes a view named myView has been registered; see the sketch below.
    Dataset<Row> df = spark.sql("SELECT * from myView");
    df.show();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
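
The query above assumes a temporary view named myView has already been registered on the session; without it, spark.sql fails with an AnalysisException. A minimal sketch of registering such a view (the CSV path is a placeholder):

Dataset<Row> input = spark.read()
    .option("header", "true")
    .csv("/path/to/data.csv"); // placeholder path
input.createOrReplaceTempView("myView");
// Now "SELECT * from myView" resolves against the input dataset.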

Example 28 with SparkSession

Use of org.apache.spark.sql.SparkSession in project incubator-systemml by apache.

The class FrameConverterTest, method runConverter().

@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch(type) {
        case CSV2BIN:
            {
                InputInfo iinfo = InputInfo.CSVInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2CSV:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case TXTCELL2BIN:
            {
                InputInfo iinfo = InputInfo.TextCellInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2TXTCELL:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case MAT2BIN:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2MAT:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                break;
            }
        case DFRM2BIN:
            {
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                // Create DataFrame
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2DFRM:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                // Convert the DataFrame back to binary blocks so the round trip
                // (binary -> DataFrame -> binary) can be compared with the original input.
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        default:
            throw new RuntimeException("Unsupported converter type: " + type.toString());
    }
    sec.close();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)
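
The DFRM2BIN and BIN2DFRM branches build a SparkSession on top of the test's existing JavaSparkContext instead of creating a second context. A minimal sketch of that pattern in isolation (master and app name are placeholders; Builder.sparkContext is marked internal in some Spark versions but is callable from Java, exactly as this test does):

JavaSparkContext sc = new JavaSparkContext("local[*]", "frame-converter-sketch");
SparkSession session = SparkSession.builder()
    .sparkContext(sc.sc()) // wrap the existing SparkContext rather than creating a new one
    .getOrCreate();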

Example 29 with SparkSession

Use of org.apache.spark.sql.SparkSession in project incubator-systemml by apache.

The class MLContextScratchCleanupTest, method runMLContextTestMultipleScript().

private static void runMLContextTestMultipleScript(RUNTIME_PLATFORM platform, boolean wRead) {
    RUNTIME_PLATFORM oldplatform = DMLScript.rtplatform;
    DMLScript.rtplatform = platform;
    // Create the MLContext on top of the SparkSession
    SparkSession spark = createSystemMLSparkSession("MLContextScratchCleanupTest", "local");
    MLContext ml = new MLContext(spark);
    ml.setExplain(true);
    String dml1 = baseDirectory + File.separator + "ScratchCleanup1.dml";
    String dml2 = baseDirectory + File.separator + (wRead ? "ScratchCleanup2b.dml" : "ScratchCleanup2.dml");
    try {
        Script script1 = dmlFromFile(dml1).in("$rows", rows).in("$cols", cols).out("X");
        Matrix X = ml.execute(script1).getMatrix("X");
        // clear in-memory/cached data to emulate on-disk storage
        X.toMatrixObject().clearData();
        Script script2 = dmlFromFile(dml2).in("X", X).out("z");
        String z = ml.execute(script2).getString("z");
        System.out.println(z);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        DMLScript.rtplatform = oldplatform;
        // Stop the underlying SparkContext to allow single-JVM tests (otherwise
        // the next test that tries to create a SparkContext would fail).
        spark.stop();
        // Clear the MLContext state and the Spark execution context.
        ml.close();
    }
}
Also used : RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) Script(org.apache.sysml.api.mlcontext.Script) DMLScript(org.apache.sysml.api.DMLScript) SparkSession(org.apache.spark.sql.SparkSession) Matrix(org.apache.sysml.api.mlcontext.Matrix) MLContext(org.apache.sysml.api.mlcontext.MLContext)
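
dmlFromFile is one of several factory methods on org.apache.sysml.api.mlcontext.ScriptFactory; the same round trip works with an inline DML string instead of a file. A minimal sketch, assuming a local master:

SparkSession spark = SparkSession.builder()
    .appName("MLContextSketch").master("local").getOrCreate();
MLContext ml = new MLContext(spark);
// Inline DML: declare z as an output and read it back after execution.
Script s = org.apache.sysml.api.mlcontext.ScriptFactory.dml("z = \"hello\";").out("z");
String z = ml.execute(s).getString("z");
System.out.println(z);
ml.close();
spark.stop();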

Example 30 with SparkSession

Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

The class S3CsvToDataset2, method start().

private void start() {
    SparkSession spark = SparkSession.builder()
        .appName("CSV on S3 to Dataset<Row>")
        .master("spark://10.0.100.81:7077")
        .config("spark.executor.memory", "1g")
        .config("spark.executor.cores", "1")
        .config("spark.cores.max", "2")
        .config("spark.driver.host", "10.0.100.182")
        .config("spark.executor.extraClassPath", "/home/jgp/net.jgp.labs.spark/target/labs-spark-2.2.0-jar-with-dependencies.jar")
        .getOrCreate();
    // The "xxx" values are placeholders; supply real AWS credentials
    // (or configure a credentials provider) before running.
    spark.sparkContext().hadoopConfiguration().set("fs.s3a.access.key", "xxx");
    spark.sparkContext().hadoopConfiguration().set("fs.s3a.secret.key", "xxx");
    // spark.sparkContext().hadoopConfiguration().set("fs.s3n.endpoint",
    // "us-east-2");
    String bucket = "bucket_name";
    String key = "key";
    String filename = "s3a://" + bucket + "/" + key;
    Dataset<Row> df = spark.read().format("csv")
        .option("inferSchema", "true")
        .option("header", "false")
        .option("sep", "|")
        .load(filename);
    df.show();
    df.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)
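
Because inferSchema costs an extra pass over the data, a common variant declares the schema up front. A sketch assuming a two-column file (column names and types are placeholders), using org.apache.spark.sql.types.DataTypes:

StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("col0", DataTypes.StringType, true),
    DataTypes.createStructField("col1", DataTypes.IntegerType, true) });
Dataset<Row> df2 = spark.read().format("csv")
    .schema(schema)
    .option("header", "false")
    .option("sep", "|")
    .load(filename);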

Aggregations

SparkSession (org.apache.spark.sql.SparkSession): 53
Row (org.apache.spark.sql.Row): 43
StructType (org.apache.spark.sql.types.StructType): 11
ArrayList (java.util.ArrayList): 6
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 6
StructField (org.apache.spark.sql.types.StructField): 6
SparkConf (org.apache.spark.SparkConf): 4
JavaRDD (org.apache.spark.api.java.JavaRDD): 3
Script (org.apache.sysml.api.mlcontext.Script): 3
Test (org.junit.Test): 3
Dataset (org.apache.spark.sql.Dataset): 2
StreamingQuery (org.apache.spark.sql.streaming.StreamingQuery): 2
StreamingQueryException (org.apache.spark.sql.streaming.StreamingQueryException): 2
DMLScript (org.apache.sysml.api.DMLScript): 2
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 2
MLContext (org.apache.sysml.api.mlcontext.MLContext): 2
Matrix (org.apache.sysml.api.mlcontext.Matrix): 2
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 2
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 2
File (java.io.File): 1